1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 8 { 9 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 10 PetscErrorCode ierr; 11 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 12 const PetscInt *idx; 13 PetscInt start,end,*ai,*aj,bs,*nidx2; 14 PetscBT table; 15 16 PetscFunctionBegin; 17 m = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 22 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 23 24 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 25 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { 29 /* Initialise the two local arrays */ 30 isz = 0; 31 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 38 for (j=0; j<n; ++j) { 39 ival = idx[j]/bs; /* convert the indices into block indices */ 40 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 42 } 43 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 44 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 45 46 k = 0; 47 for (j=0; j<ov; j++) { /* for each overlap*/ 48 n = isz; 49 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 50 row = nidx[k]; 51 start = ai[row]; 52 end = ai[row+1]; 53 for (l = start; l<end; l++) { 54 val = aj[l]; 55 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 56 } 57 } 58 } 59 /* expand the Index Set */ 60 for (j=0; j<isz; j++) { 61 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 62 } 63 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 64 } 65 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 66 ierr = PetscFree(nidx);CHKERRQ(ierr); 67 ierr = PetscFree(nidx2);CHKERRQ(ierr); 68 PetscFunctionReturn(0); 69 } 70 71 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 72 { 73 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 74 PetscErrorCode ierr; 75 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 76 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 77 const PetscInt *irow,*icol; 78 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 79 PetscInt *aj = a->j,*ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 86 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 88 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 89 90 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 91 ssmap = smap; 92 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 93 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 94 /* determine lens of each row */ 95 for (i=0; i<nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k=kstart; k<kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ*)((*B)->data); 106 107 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 108 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 109 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 110 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 111 C = *B; 112 } else { 113 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 114 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 115 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 116 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 117 } 118 c = (Mat_SeqBAIJ*)(C->data); 119 for (i=0; i<nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j + mat_i; 125 mat_a = c->a + mat_i*bs2; 126 mat_ilen = c->ilen + i; 127 for (k=kstart; k<kend; k++) { 128 if ((tcol=ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 { 138 MatScalar *work; 139 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 140 for (i=0; i<nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i*bs2; 145 ilen = c->ilen[i]; 146 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 147 } 148 ierr = PetscFree(work);CHKERRQ(ierr); 149 } 150 151 /* Free work space */ 152 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 153 ierr = PetscFree(smap);CHKERRQ(ierr); 154 ierr = PetscFree(lens);CHKERRQ(ierr); 155 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 157 158 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 maxmnbs = PetscMax(a->mbs,a->nbs); 180 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 } 186 count = 0; 187 for (i=0; i<nrows; i++) { 188 j = irow[i] / bs; 189 if ((vary[j]--)==bs) iary[count++] = j; 190 } 191 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 192 193 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 194 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 195 for (i=0; i<a->nbs; i++) { 196 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 197 } 198 count = 0; 199 for (i=0; i<ncols; i++) { 200 j = icol[i] / bs; 201 if ((vary[j]--)==bs) iary[count++] = j; 202 } 203 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(&is1);CHKERRQ(ierr); 210 ierr = ISDestroy(&is2);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C) 215 { 216 PetscErrorCode ierr; 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubSppt *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 ierr = submatj->destroy(C);CHKERRQ(ierr); 222 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[]) 227 { 228 PetscErrorCode ierr; 229 PetscInt i; 230 Mat C; 231 Mat_SeqBAIJ *c; 232 Mat_SubSppt *submatj; 233 234 PetscFunctionBegin; 235 for (i=0; i<n; i++) { 236 C = (*mat)[i]; 237 c = (Mat_SeqBAIJ*)C->data; 238 submatj = c->submatis1; 239 if (submatj) { 240 ierr = submatj->destroy(C);CHKERRQ(ierr); 241 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr); 242 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr); 243 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr); 244 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr); 245 } else { 246 ierr = MatDestroy(&C);CHKERRQ(ierr); 247 } 248 } 249 ierr = PetscFree(*mat);CHKERRQ(ierr); 250 PetscFunctionReturn(0); 251 } 252 253 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 254 { 255 PetscErrorCode ierr; 256 PetscInt i; 257 258 PetscFunctionBegin; 259 if (scall == MAT_INITIAL_MATRIX) { 260 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 261 } 262 263 for (i=0; i<n; i++) { 264 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 265 } 266 PetscFunctionReturn(0); 267 } 268 269 /* -------------------------------------------------------*/ 270 /* Should check that shapes of vectors and matrices match */ 271 /* -------------------------------------------------------*/ 272 273 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 274 { 275 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 276 PetscScalar *z,sum; 277 const PetscScalar *x; 278 const MatScalar *v; 279 PetscErrorCode ierr; 280 PetscInt mbs,i,n; 281 const PetscInt *idx,*ii,*ridx=NULL; 282 PetscBool usecprow=a->compressedrow.use; 283 284 PetscFunctionBegin; 285 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 286 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 287 288 if (usecprow) { 289 mbs = a->compressedrow.nrows; 290 ii = a->compressedrow.i; 291 ridx = a->compressedrow.rindex; 292 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 293 } else { 294 mbs = a->mbs; 295 ii = a->i; 296 } 297 298 for (i=0; i<mbs; i++) { 299 n = ii[1] - ii[0]; 300 v = a->a + ii[0]; 301 idx = a->j + ii[0]; 302 ii++; 303 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 304 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 305 sum = 0.0; 306 PetscSparseDensePlusDot(sum,x,v,idx,n); 307 if (usecprow) { 308 z[ridx[i]] = sum; 309 } else { 310 z[i] = sum; 311 } 312 } 313 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 314 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 315 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 316 PetscFunctionReturn(0); 317 } 318 319 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 320 { 321 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 322 PetscScalar *z = 0,sum1,sum2,*zarray; 323 const PetscScalar *x,*xb; 324 PetscScalar x1,x2; 325 const MatScalar *v; 326 PetscErrorCode ierr; 327 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 328 PetscBool usecprow=a->compressedrow.use; 329 330 PetscFunctionBegin; 331 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 332 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 333 334 idx = a->j; 335 v = a->a; 336 if (usecprow) { 337 mbs = a->compressedrow.nrows; 338 ii = a->compressedrow.i; 339 ridx = a->compressedrow.rindex; 340 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 341 } else { 342 mbs = a->mbs; 343 ii = a->i; 344 z = zarray; 345 } 346 347 for (i=0; i<mbs; i++) { 348 n = ii[1] - ii[0]; ii++; 349 sum1 = 0.0; sum2 = 0.0; 350 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 351 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 352 for (j=0; j<n; j++) { 353 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 354 sum1 += v[0]*x1 + v[2]*x2; 355 sum2 += v[1]*x1 + v[3]*x2; 356 v += 4; 357 } 358 if (usecprow) z = zarray + 2*ridx[i]; 359 z[0] = sum1; z[1] = sum2; 360 if (!usecprow) z += 2; 361 } 362 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 363 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 364 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 365 PetscFunctionReturn(0); 366 } 367 368 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 369 { 370 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 371 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 372 const PetscScalar *x,*xb; 373 const MatScalar *v; 374 PetscErrorCode ierr; 375 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 376 PetscBool usecprow=a->compressedrow.use; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*v,*z,*xb) 380 #endif 381 382 PetscFunctionBegin; 383 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 384 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 385 386 idx = a->j; 387 v = a->a; 388 if (usecprow) { 389 mbs = a->compressedrow.nrows; 390 ii = a->compressedrow.i; 391 ridx = a->compressedrow.rindex; 392 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 393 } else { 394 mbs = a->mbs; 395 ii = a->i; 396 z = zarray; 397 } 398 399 for (i=0; i<mbs; i++) { 400 n = ii[1] - ii[0]; ii++; 401 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 402 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 403 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 404 for (j=0; j<n; j++) { 405 xb = x + 3*(*idx++); 406 x1 = xb[0]; 407 x2 = xb[1]; 408 x3 = xb[2]; 409 410 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 411 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 412 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 413 v += 9; 414 } 415 if (usecprow) z = zarray + 3*ridx[i]; 416 z[0] = sum1; z[1] = sum2; z[2] = sum3; 417 if (!usecprow) z += 3; 418 } 419 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 420 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 421 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 422 PetscFunctionReturn(0); 423 } 424 425 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 426 { 427 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 428 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 429 const PetscScalar *x,*xb; 430 const MatScalar *v; 431 PetscErrorCode ierr; 432 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 433 PetscBool usecprow=a->compressedrow.use; 434 435 PetscFunctionBegin; 436 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 437 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 438 439 idx = a->j; 440 v = a->a; 441 if (usecprow) { 442 mbs = a->compressedrow.nrows; 443 ii = a->compressedrow.i; 444 ridx = a->compressedrow.rindex; 445 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 446 } else { 447 mbs = a->mbs; 448 ii = a->i; 449 z = zarray; 450 } 451 452 for (i=0; i<mbs; i++) { 453 n = ii[1] - ii[0]; 454 ii++; 455 sum1 = 0.0; 456 sum2 = 0.0; 457 sum3 = 0.0; 458 sum4 = 0.0; 459 460 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 461 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 462 for (j=0; j<n; j++) { 463 xb = x + 4*(*idx++); 464 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 465 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 466 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 467 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 468 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 469 v += 16; 470 } 471 if (usecprow) z = zarray + 4*ridx[i]; 472 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 473 if (!usecprow) z += 4; 474 } 475 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 477 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 478 PetscFunctionReturn(0); 479 } 480 481 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 482 { 483 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 484 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 485 const PetscScalar *xb,*x; 486 const MatScalar *v; 487 PetscErrorCode ierr; 488 const PetscInt *idx,*ii,*ridx=NULL; 489 PetscInt mbs,i,j,n; 490 PetscBool usecprow=a->compressedrow.use; 491 492 PetscFunctionBegin; 493 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 494 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 495 496 idx = a->j; 497 v = a->a; 498 if (usecprow) { 499 mbs = a->compressedrow.nrows; 500 ii = a->compressedrow.i; 501 ridx = a->compressedrow.rindex; 502 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 503 } else { 504 mbs = a->mbs; 505 ii = a->i; 506 z = zarray; 507 } 508 509 for (i=0; i<mbs; i++) { 510 n = ii[1] - ii[0]; ii++; 511 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 512 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 513 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 514 for (j=0; j<n; j++) { 515 xb = x + 5*(*idx++); 516 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 517 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 518 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 519 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 520 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 521 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 522 v += 25; 523 } 524 if (usecprow) z = zarray + 5*ridx[i]; 525 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 526 if (!usecprow) z += 5; 527 } 528 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 529 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 530 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 531 PetscFunctionReturn(0); 532 } 533 534 535 536 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 537 { 538 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 539 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 540 const PetscScalar *x,*xb; 541 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 542 const MatScalar *v; 543 PetscErrorCode ierr; 544 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 545 PetscBool usecprow=a->compressedrow.use; 546 547 PetscFunctionBegin; 548 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 549 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 550 551 idx = a->j; 552 v = a->a; 553 if (usecprow) { 554 mbs = a->compressedrow.nrows; 555 ii = a->compressedrow.i; 556 ridx = a->compressedrow.rindex; 557 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 558 } else { 559 mbs = a->mbs; 560 ii = a->i; 561 z = zarray; 562 } 563 564 for (i=0; i<mbs; i++) { 565 n = ii[1] - ii[0]; 566 ii++; 567 sum1 = 0.0; 568 sum2 = 0.0; 569 sum3 = 0.0; 570 sum4 = 0.0; 571 sum5 = 0.0; 572 sum6 = 0.0; 573 574 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 575 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 576 for (j=0; j<n; j++) { 577 xb = x + 6*(*idx++); 578 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 579 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 580 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 581 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 582 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 583 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 584 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 585 v += 36; 586 } 587 if (usecprow) z = zarray + 6*ridx[i]; 588 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 589 if (!usecprow) z += 6; 590 } 591 592 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 593 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 594 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 595 PetscFunctionReturn(0); 596 } 597 598 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 599 { 600 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 601 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 602 const PetscScalar *x,*xb; 603 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 604 const MatScalar *v; 605 PetscErrorCode ierr; 606 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 607 PetscBool usecprow=a->compressedrow.use; 608 609 PetscFunctionBegin; 610 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 611 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 612 613 idx = a->j; 614 v = a->a; 615 if (usecprow) { 616 mbs = a->compressedrow.nrows; 617 ii = a->compressedrow.i; 618 ridx = a->compressedrow.rindex; 619 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 620 } else { 621 mbs = a->mbs; 622 ii = a->i; 623 z = zarray; 624 } 625 626 for (i=0; i<mbs; i++) { 627 n = ii[1] - ii[0]; 628 ii++; 629 sum1 = 0.0; 630 sum2 = 0.0; 631 sum3 = 0.0; 632 sum4 = 0.0; 633 sum5 = 0.0; 634 sum6 = 0.0; 635 sum7 = 0.0; 636 637 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 638 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 639 for (j=0; j<n; j++) { 640 xb = x + 7*(*idx++); 641 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 642 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 643 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 644 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 645 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 646 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 647 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 648 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 649 v += 49; 650 } 651 if (usecprow) z = zarray + 7*ridx[i]; 652 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 653 if (!usecprow) z += 7; 654 } 655 656 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 657 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 658 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661 662 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 663 { 664 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 665 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 666 const PetscScalar *x,*xb; 667 PetscScalar *zarray,xv; 668 const MatScalar *v; 669 PetscErrorCode ierr; 670 const PetscInt *ii,*ij=a->j,*idx; 671 PetscInt mbs,i,j,k,n,*ridx=NULL; 672 PetscBool usecprow=a->compressedrow.use; 673 674 PetscFunctionBegin; 675 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 676 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 677 678 v = a->a; 679 if (usecprow) { 680 mbs = a->compressedrow.nrows; 681 ii = a->compressedrow.i; 682 ridx = a->compressedrow.rindex; 683 ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 684 } else { 685 mbs = a->mbs; 686 ii = a->i; 687 z = zarray; 688 } 689 690 for (i=0; i<mbs; i++) { 691 n = ii[i+1] - ii[i]; 692 idx = ij + ii[i]; 693 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 694 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 695 696 for (j=0; j<n; j++) { 697 xb = x + 11*(idx[j]); 698 699 for (k=0; k<11; k++) { 700 xv = xb[k]; 701 sum1 += v[0]*xv; 702 sum2 += v[1]*xv; 703 sum3 += v[2]*xv; 704 sum4 += v[3]*xv; 705 sum5 += v[4]*xv; 706 sum6 += v[5]*xv; 707 sum7 += v[6]*xv; 708 sum8 += v[7]*xv; 709 sum9 += v[8]*xv; 710 sum10 += v[9]*xv; 711 sum11 += v[10]*xv; 712 v += 11; 713 } 714 } 715 if (usecprow) z = zarray + 11*ridx[i]; 716 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 717 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 718 719 if (!usecprow) z += 11; 720 } 721 722 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 723 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 724 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 725 PetscFunctionReturn(0); 726 } 727 728 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 729 /* Default MatMult for block size 15 */ 730 731 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 732 { 733 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 734 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 735 const PetscScalar *x,*xb; 736 PetscScalar *zarray,xv; 737 const MatScalar *v; 738 PetscErrorCode ierr; 739 const PetscInt *ii,*ij=a->j,*idx; 740 PetscInt mbs,i,j,k,n,*ridx=NULL; 741 PetscBool usecprow=a->compressedrow.use; 742 743 PetscFunctionBegin; 744 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 745 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 746 747 v = a->a; 748 if (usecprow) { 749 mbs = a->compressedrow.nrows; 750 ii = a->compressedrow.i; 751 ridx = a->compressedrow.rindex; 752 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 753 } else { 754 mbs = a->mbs; 755 ii = a->i; 756 z = zarray; 757 } 758 759 for (i=0; i<mbs; i++) { 760 n = ii[i+1] - ii[i]; 761 idx = ij + ii[i]; 762 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 763 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 764 765 for (j=0; j<n; j++) { 766 xb = x + 15*(idx[j]); 767 768 for (k=0; k<15; k++) { 769 xv = xb[k]; 770 sum1 += v[0]*xv; 771 sum2 += v[1]*xv; 772 sum3 += v[2]*xv; 773 sum4 += v[3]*xv; 774 sum5 += v[4]*xv; 775 sum6 += v[5]*xv; 776 sum7 += v[6]*xv; 777 sum8 += v[7]*xv; 778 sum9 += v[8]*xv; 779 sum10 += v[9]*xv; 780 sum11 += v[10]*xv; 781 sum12 += v[11]*xv; 782 sum13 += v[12]*xv; 783 sum14 += v[13]*xv; 784 sum15 += v[14]*xv; 785 v += 15; 786 } 787 } 788 if (usecprow) z = zarray + 15*ridx[i]; 789 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 790 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 791 792 if (!usecprow) z += 15; 793 } 794 795 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 796 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 797 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 798 PetscFunctionReturn(0); 799 } 800 801 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 802 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 803 { 804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 805 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 806 const PetscScalar *x,*xb; 807 PetscScalar x1,x2,x3,x4,*zarray; 808 const MatScalar *v; 809 PetscErrorCode ierr; 810 const PetscInt *ii,*ij=a->j,*idx; 811 PetscInt mbs,i,j,n,*ridx=NULL; 812 PetscBool usecprow=a->compressedrow.use; 813 814 PetscFunctionBegin; 815 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 816 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 817 818 v = a->a; 819 if (usecprow) { 820 mbs = a->compressedrow.nrows; 821 ii = a->compressedrow.i; 822 ridx = a->compressedrow.rindex; 823 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 824 } else { 825 mbs = a->mbs; 826 ii = a->i; 827 z = zarray; 828 } 829 830 for (i=0; i<mbs; i++) { 831 n = ii[i+1] - ii[i]; 832 idx = ij + ii[i]; 833 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 834 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 835 836 for (j=0; j<n; j++) { 837 xb = x + 15*(idx[j]); 838 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 839 840 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 841 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 842 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 843 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 844 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 845 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 846 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 847 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 848 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 849 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 850 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 851 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 852 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 853 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 854 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 855 856 v += 60; 857 858 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 859 860 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 861 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 862 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 863 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 864 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 865 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 866 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 867 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 868 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 869 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 870 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 871 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 872 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 873 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 874 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 875 v += 60; 876 877 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 878 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 879 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 880 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 881 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 882 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 883 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 884 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 885 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 886 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 887 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 888 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 889 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 890 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 891 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 892 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 893 v += 60; 894 895 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 896 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 897 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 898 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 899 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 900 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 901 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 902 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 903 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 904 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 905 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 906 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 907 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 908 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 909 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 910 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 911 v += 45; 912 } 913 if (usecprow) z = zarray + 15*ridx[i]; 914 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 915 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 916 917 if (!usecprow) z += 15; 918 } 919 920 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 921 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 922 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 923 PetscFunctionReturn(0); 924 } 925 926 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 927 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 928 { 929 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 930 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 931 const PetscScalar *x,*xb; 932 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 933 const MatScalar *v; 934 PetscErrorCode ierr; 935 const PetscInt *ii,*ij=a->j,*idx; 936 PetscInt mbs,i,j,n,*ridx=NULL; 937 PetscBool usecprow=a->compressedrow.use; 938 939 PetscFunctionBegin; 940 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 941 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 942 943 v = a->a; 944 if (usecprow) { 945 mbs = a->compressedrow.nrows; 946 ii = a->compressedrow.i; 947 ridx = a->compressedrow.rindex; 948 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 949 } else { 950 mbs = a->mbs; 951 ii = a->i; 952 z = zarray; 953 } 954 955 for (i=0; i<mbs; i++) { 956 n = ii[i+1] - ii[i]; 957 idx = ij + ii[i]; 958 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 959 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 960 961 for (j=0; j<n; j++) { 962 xb = x + 15*(idx[j]); 963 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 964 x8 = xb[7]; 965 966 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 967 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 968 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 969 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 970 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 971 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 972 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 973 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 974 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 975 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 976 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 977 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 978 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 979 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 980 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 981 v += 120; 982 983 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 984 985 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 986 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 987 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 988 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 989 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 990 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 991 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 992 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 993 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 994 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 995 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 996 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 997 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 998 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 999 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1000 v += 105; 1001 } 1002 if (usecprow) z = zarray + 15*ridx[i]; 1003 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1004 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1005 1006 if (!usecprow) z += 15; 1007 } 1008 1009 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1010 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1011 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1012 PetscFunctionReturn(0); 1013 } 1014 1015 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1016 1017 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1018 { 1019 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1020 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1021 const PetscScalar *x,*xb; 1022 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1023 const MatScalar *v; 1024 PetscErrorCode ierr; 1025 const PetscInt *ii,*ij=a->j,*idx; 1026 PetscInt mbs,i,j,n,*ridx=NULL; 1027 PetscBool usecprow=a->compressedrow.use; 1028 1029 PetscFunctionBegin; 1030 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1031 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1032 1033 v = a->a; 1034 if (usecprow) { 1035 mbs = a->compressedrow.nrows; 1036 ii = a->compressedrow.i; 1037 ridx = a->compressedrow.rindex; 1038 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1039 } else { 1040 mbs = a->mbs; 1041 ii = a->i; 1042 z = zarray; 1043 } 1044 1045 for (i=0; i<mbs; i++) { 1046 n = ii[i+1] - ii[i]; 1047 idx = ij + ii[i]; 1048 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1049 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1050 1051 for (j=0; j<n; j++) { 1052 xb = x + 15*(idx[j]); 1053 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1054 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1055 1056 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1057 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1058 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1059 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1060 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1061 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1062 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1063 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1064 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1065 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1066 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1067 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1068 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1069 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1070 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1071 v += 225; 1072 } 1073 if (usecprow) z = zarray + 15*ridx[i]; 1074 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1075 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1076 1077 if (!usecprow) z += 15; 1078 } 1079 1080 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1081 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1082 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1083 PetscFunctionReturn(0); 1084 } 1085 1086 1087 /* 1088 This will not work with MatScalar == float because it calls the BLAS 1089 */ 1090 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1091 { 1092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1093 PetscScalar *z = 0,*work,*workt,*zarray; 1094 const PetscScalar *x,*xb; 1095 const MatScalar *v; 1096 PetscErrorCode ierr; 1097 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1098 const PetscInt *idx,*ii,*ridx=NULL; 1099 PetscInt ncols,k; 1100 PetscBool usecprow=a->compressedrow.use; 1101 1102 PetscFunctionBegin; 1103 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1104 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1105 1106 idx = a->j; 1107 v = a->a; 1108 if (usecprow) { 1109 mbs = a->compressedrow.nrows; 1110 ii = a->compressedrow.i; 1111 ridx = a->compressedrow.rindex; 1112 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1113 } else { 1114 mbs = a->mbs; 1115 ii = a->i; 1116 z = zarray; 1117 } 1118 1119 if (!a->mult_work) { 1120 k = PetscMax(A->rmap->n,A->cmap->n); 1121 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1122 } 1123 work = a->mult_work; 1124 for (i=0; i<mbs; i++) { 1125 n = ii[1] - ii[0]; ii++; 1126 ncols = n*bs; 1127 workt = work; 1128 for (j=0; j<n; j++) { 1129 xb = x + bs*(*idx++); 1130 for (k=0; k<bs; k++) workt[k] = xb[k]; 1131 workt += bs; 1132 } 1133 if (usecprow) z = zarray + bs*ridx[i]; 1134 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1135 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1136 v += n*bs2; 1137 if (!usecprow) z += bs; 1138 } 1139 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1140 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1141 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1146 { 1147 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1148 const PetscScalar *x; 1149 PetscScalar *y,*z,sum; 1150 const MatScalar *v; 1151 PetscErrorCode ierr; 1152 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1153 const PetscInt *idx,*ii; 1154 PetscBool usecprow=a->compressedrow.use; 1155 1156 PetscFunctionBegin; 1157 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1158 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1159 1160 idx = a->j; 1161 v = a->a; 1162 if (usecprow) { 1163 if (zz != yy) { 1164 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1165 } 1166 mbs = a->compressedrow.nrows; 1167 ii = a->compressedrow.i; 1168 ridx = a->compressedrow.rindex; 1169 } else { 1170 ii = a->i; 1171 } 1172 1173 for (i=0; i<mbs; i++) { 1174 n = ii[1] - ii[0]; 1175 ii++; 1176 if (!usecprow) { 1177 sum = y[i]; 1178 } else { 1179 sum = y[ridx[i]]; 1180 } 1181 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1182 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1183 PetscSparseDensePlusDot(sum,x,v,idx,n); 1184 v += n; 1185 idx += n; 1186 if (usecprow) { 1187 z[ridx[i]] = sum; 1188 } else { 1189 z[i] = sum; 1190 } 1191 } 1192 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1193 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1194 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1195 PetscFunctionReturn(0); 1196 } 1197 1198 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1199 { 1200 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1201 PetscScalar *y = 0,*z = 0,sum1,sum2; 1202 const PetscScalar *x,*xb; 1203 PetscScalar x1,x2,*yarray,*zarray; 1204 const MatScalar *v; 1205 PetscErrorCode ierr; 1206 PetscInt mbs = a->mbs,i,n,j; 1207 const PetscInt *idx,*ii,*ridx = NULL; 1208 PetscBool usecprow = a->compressedrow.use; 1209 1210 PetscFunctionBegin; 1211 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1212 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1213 1214 idx = a->j; 1215 v = a->a; 1216 if (usecprow) { 1217 if (zz != yy) { 1218 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1219 } 1220 mbs = a->compressedrow.nrows; 1221 ii = a->compressedrow.i; 1222 ridx = a->compressedrow.rindex; 1223 } else { 1224 ii = a->i; 1225 y = yarray; 1226 z = zarray; 1227 } 1228 1229 for (i=0; i<mbs; i++) { 1230 n = ii[1] - ii[0]; ii++; 1231 if (usecprow) { 1232 z = zarray + 2*ridx[i]; 1233 y = yarray + 2*ridx[i]; 1234 } 1235 sum1 = y[0]; sum2 = y[1]; 1236 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1237 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1238 for (j=0; j<n; j++) { 1239 xb = x + 2*(*idx++); 1240 x1 = xb[0]; 1241 x2 = xb[1]; 1242 1243 sum1 += v[0]*x1 + v[2]*x2; 1244 sum2 += v[1]*x1 + v[3]*x2; 1245 v += 4; 1246 } 1247 z[0] = sum1; z[1] = sum2; 1248 if (!usecprow) { 1249 z += 2; y += 2; 1250 } 1251 } 1252 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1253 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1254 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1255 PetscFunctionReturn(0); 1256 } 1257 1258 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1259 { 1260 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1261 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1262 const PetscScalar *x,*xb; 1263 const MatScalar *v; 1264 PetscErrorCode ierr; 1265 PetscInt mbs = a->mbs,i,j,n; 1266 const PetscInt *idx,*ii,*ridx = NULL; 1267 PetscBool usecprow = a->compressedrow.use; 1268 1269 PetscFunctionBegin; 1270 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1271 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1272 1273 idx = a->j; 1274 v = a->a; 1275 if (usecprow) { 1276 if (zz != yy) { 1277 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1278 } 1279 mbs = a->compressedrow.nrows; 1280 ii = a->compressedrow.i; 1281 ridx = a->compressedrow.rindex; 1282 } else { 1283 ii = a->i; 1284 y = yarray; 1285 z = zarray; 1286 } 1287 1288 for (i=0; i<mbs; i++) { 1289 n = ii[1] - ii[0]; ii++; 1290 if (usecprow) { 1291 z = zarray + 3*ridx[i]; 1292 y = yarray + 3*ridx[i]; 1293 } 1294 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1295 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1296 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1297 for (j=0; j<n; j++) { 1298 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1299 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1300 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1301 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1302 v += 9; 1303 } 1304 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1305 if (!usecprow) { 1306 z += 3; y += 3; 1307 } 1308 } 1309 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1310 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1311 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1312 PetscFunctionReturn(0); 1313 } 1314 1315 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1316 { 1317 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1318 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1319 const PetscScalar *x,*xb; 1320 const MatScalar *v; 1321 PetscErrorCode ierr; 1322 PetscInt mbs = a->mbs,i,j,n; 1323 const PetscInt *idx,*ii,*ridx=NULL; 1324 PetscBool usecprow=a->compressedrow.use; 1325 1326 PetscFunctionBegin; 1327 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1328 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1329 1330 idx = a->j; 1331 v = a->a; 1332 if (usecprow) { 1333 if (zz != yy) { 1334 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1335 } 1336 mbs = a->compressedrow.nrows; 1337 ii = a->compressedrow.i; 1338 ridx = a->compressedrow.rindex; 1339 } else { 1340 ii = a->i; 1341 y = yarray; 1342 z = zarray; 1343 } 1344 1345 for (i=0; i<mbs; i++) { 1346 n = ii[1] - ii[0]; ii++; 1347 if (usecprow) { 1348 z = zarray + 4*ridx[i]; 1349 y = yarray + 4*ridx[i]; 1350 } 1351 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1352 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1353 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1354 for (j=0; j<n; j++) { 1355 xb = x + 4*(*idx++); 1356 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1357 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1358 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1359 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1360 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1361 v += 16; 1362 } 1363 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1364 if (!usecprow) { 1365 z += 4; y += 4; 1366 } 1367 } 1368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1369 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1370 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1371 PetscFunctionReturn(0); 1372 } 1373 1374 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1375 { 1376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1377 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1378 const PetscScalar *x,*xb; 1379 PetscScalar *yarray,*zarray; 1380 const MatScalar *v; 1381 PetscErrorCode ierr; 1382 PetscInt mbs = a->mbs,i,j,n; 1383 const PetscInt *idx,*ii,*ridx = NULL; 1384 PetscBool usecprow=a->compressedrow.use; 1385 1386 PetscFunctionBegin; 1387 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1388 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1389 1390 idx = a->j; 1391 v = a->a; 1392 if (usecprow) { 1393 if (zz != yy) { 1394 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1395 } 1396 mbs = a->compressedrow.nrows; 1397 ii = a->compressedrow.i; 1398 ridx = a->compressedrow.rindex; 1399 } else { 1400 ii = a->i; 1401 y = yarray; 1402 z = zarray; 1403 } 1404 1405 for (i=0; i<mbs; i++) { 1406 n = ii[1] - ii[0]; ii++; 1407 if (usecprow) { 1408 z = zarray + 5*ridx[i]; 1409 y = yarray + 5*ridx[i]; 1410 } 1411 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1412 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1413 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1414 for (j=0; j<n; j++) { 1415 xb = x + 5*(*idx++); 1416 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1417 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1418 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1419 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1420 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1421 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1422 v += 25; 1423 } 1424 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1425 if (!usecprow) { 1426 z += 5; y += 5; 1427 } 1428 } 1429 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1430 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1431 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1432 PetscFunctionReturn(0); 1433 } 1434 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1435 { 1436 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1437 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1438 const PetscScalar *x,*xb; 1439 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1440 const MatScalar *v; 1441 PetscErrorCode ierr; 1442 PetscInt mbs = a->mbs,i,j,n; 1443 const PetscInt *idx,*ii,*ridx=NULL; 1444 PetscBool usecprow=a->compressedrow.use; 1445 1446 PetscFunctionBegin; 1447 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1448 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1449 1450 idx = a->j; 1451 v = a->a; 1452 if (usecprow) { 1453 if (zz != yy) { 1454 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1455 } 1456 mbs = a->compressedrow.nrows; 1457 ii = a->compressedrow.i; 1458 ridx = a->compressedrow.rindex; 1459 } else { 1460 ii = a->i; 1461 y = yarray; 1462 z = zarray; 1463 } 1464 1465 for (i=0; i<mbs; i++) { 1466 n = ii[1] - ii[0]; ii++; 1467 if (usecprow) { 1468 z = zarray + 6*ridx[i]; 1469 y = yarray + 6*ridx[i]; 1470 } 1471 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1472 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1473 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1474 for (j=0; j<n; j++) { 1475 xb = x + 6*(*idx++); 1476 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1477 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1478 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1479 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1480 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1481 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1482 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1483 v += 36; 1484 } 1485 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1486 if (!usecprow) { 1487 z += 6; y += 6; 1488 } 1489 } 1490 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1491 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1492 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1493 PetscFunctionReturn(0); 1494 } 1495 1496 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1497 { 1498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1499 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1500 const PetscScalar *x,*xb; 1501 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1502 const MatScalar *v; 1503 PetscErrorCode ierr; 1504 PetscInt mbs = a->mbs,i,j,n; 1505 const PetscInt *idx,*ii,*ridx = NULL; 1506 PetscBool usecprow=a->compressedrow.use; 1507 1508 PetscFunctionBegin; 1509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1510 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1511 1512 idx = a->j; 1513 v = a->a; 1514 if (usecprow) { 1515 if (zz != yy) { 1516 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1517 } 1518 mbs = a->compressedrow.nrows; 1519 ii = a->compressedrow.i; 1520 ridx = a->compressedrow.rindex; 1521 } else { 1522 ii = a->i; 1523 y = yarray; 1524 z = zarray; 1525 } 1526 1527 for (i=0; i<mbs; i++) { 1528 n = ii[1] - ii[0]; ii++; 1529 if (usecprow) { 1530 z = zarray + 7*ridx[i]; 1531 y = yarray + 7*ridx[i]; 1532 } 1533 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1534 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1535 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1536 for (j=0; j<n; j++) { 1537 xb = x + 7*(*idx++); 1538 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1539 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1540 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1541 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1542 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1543 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1544 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1545 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1546 v += 49; 1547 } 1548 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1549 if (!usecprow) { 1550 z += 7; y += 7; 1551 } 1552 } 1553 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1554 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1555 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1556 PetscFunctionReturn(0); 1557 } 1558 1559 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1560 { 1561 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1562 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1563 const PetscScalar *x,*xb; 1564 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1565 const MatScalar *v; 1566 PetscErrorCode ierr; 1567 PetscInt mbs = a->mbs,i,j,n; 1568 const PetscInt *idx,*ii,*ridx = NULL; 1569 PetscBool usecprow=a->compressedrow.use; 1570 1571 PetscFunctionBegin; 1572 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1573 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1574 1575 idx = a->j; 1576 v = a->a; 1577 if (usecprow) { 1578 if (zz != yy) { 1579 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1580 } 1581 mbs = a->compressedrow.nrows; 1582 ii = a->compressedrow.i; 1583 ridx = a->compressedrow.rindex; 1584 } else { 1585 ii = a->i; 1586 y = yarray; 1587 z = zarray; 1588 } 1589 1590 for (i=0; i<mbs; i++) { 1591 n = ii[1] - ii[0]; ii++; 1592 if (usecprow) { 1593 z = zarray + 11*ridx[i]; 1594 y = yarray + 11*ridx[i]; 1595 } 1596 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1597 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1599 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1600 for (j=0; j<n; j++) { 1601 xb = x + 11*(*idx++); 1602 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1603 sum1 += v[0]*x1 + v[11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1604 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1605 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1606 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1607 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1608 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1609 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1610 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1611 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1612 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1613 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1614 v += 121; 1615 } 1616 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1617 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1618 if (!usecprow) { 1619 z += 11; y += 11; 1620 } 1621 } 1622 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1623 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1624 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1625 PetscFunctionReturn(0); 1626 } 1627 1628 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1629 { 1630 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1631 PetscScalar *z = 0,*work,*workt,*zarray; 1632 const PetscScalar *x,*xb; 1633 const MatScalar *v; 1634 PetscErrorCode ierr; 1635 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1636 PetscInt ncols,k; 1637 const PetscInt *ridx = NULL,*idx,*ii; 1638 PetscBool usecprow = a->compressedrow.use; 1639 1640 PetscFunctionBegin; 1641 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1642 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1643 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1644 1645 idx = a->j; 1646 v = a->a; 1647 if (usecprow) { 1648 mbs = a->compressedrow.nrows; 1649 ii = a->compressedrow.i; 1650 ridx = a->compressedrow.rindex; 1651 } else { 1652 mbs = a->mbs; 1653 ii = a->i; 1654 z = zarray; 1655 } 1656 1657 if (!a->mult_work) { 1658 k = PetscMax(A->rmap->n,A->cmap->n); 1659 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1660 } 1661 work = a->mult_work; 1662 for (i=0; i<mbs; i++) { 1663 n = ii[1] - ii[0]; ii++; 1664 ncols = n*bs; 1665 workt = work; 1666 for (j=0; j<n; j++) { 1667 xb = x + bs*(*idx++); 1668 for (k=0; k<bs; k++) workt[k] = xb[k]; 1669 workt += bs; 1670 } 1671 if (usecprow) z = zarray + bs*ridx[i]; 1672 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1673 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1674 v += n*bs2; 1675 if (!usecprow) z += bs; 1676 } 1677 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1678 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1679 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1680 PetscFunctionReturn(0); 1681 } 1682 1683 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1684 { 1685 PetscScalar zero = 0.0; 1686 PetscErrorCode ierr; 1687 1688 PetscFunctionBegin; 1689 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1690 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1691 PetscFunctionReturn(0); 1692 } 1693 1694 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1695 { 1696 PetscScalar zero = 0.0; 1697 PetscErrorCode ierr; 1698 1699 PetscFunctionBegin; 1700 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1701 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1702 PetscFunctionReturn(0); 1703 } 1704 1705 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1706 { 1707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1708 PetscScalar *z,x1,x2,x3,x4,x5; 1709 const PetscScalar *x,*xb = NULL; 1710 const MatScalar *v; 1711 PetscErrorCode ierr; 1712 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1713 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1714 Mat_CompressedRow cprow = a->compressedrow; 1715 PetscBool usecprow = cprow.use; 1716 1717 PetscFunctionBegin; 1718 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1719 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1720 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1721 1722 idx = a->j; 1723 v = a->a; 1724 if (usecprow) { 1725 mbs = cprow.nrows; 1726 ii = cprow.i; 1727 ridx = cprow.rindex; 1728 } else { 1729 mbs=a->mbs; 1730 ii = a->i; 1731 xb = x; 1732 } 1733 1734 switch (bs) { 1735 case 1: 1736 for (i=0; i<mbs; i++) { 1737 if (usecprow) xb = x + ridx[i]; 1738 x1 = xb[0]; 1739 ib = idx + ii[0]; 1740 n = ii[1] - ii[0]; ii++; 1741 for (j=0; j<n; j++) { 1742 rval = ib[j]; 1743 z[rval] += PetscConj(*v) * x1; 1744 v++; 1745 } 1746 if (!usecprow) xb++; 1747 } 1748 break; 1749 case 2: 1750 for (i=0; i<mbs; i++) { 1751 if (usecprow) xb = x + 2*ridx[i]; 1752 x1 = xb[0]; x2 = xb[1]; 1753 ib = idx + ii[0]; 1754 n = ii[1] - ii[0]; ii++; 1755 for (j=0; j<n; j++) { 1756 rval = ib[j]*2; 1757 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1758 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1759 v += 4; 1760 } 1761 if (!usecprow) xb += 2; 1762 } 1763 break; 1764 case 3: 1765 for (i=0; i<mbs; i++) { 1766 if (usecprow) xb = x + 3*ridx[i]; 1767 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1768 ib = idx + ii[0]; 1769 n = ii[1] - ii[0]; ii++; 1770 for (j=0; j<n; j++) { 1771 rval = ib[j]*3; 1772 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1773 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1774 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1775 v += 9; 1776 } 1777 if (!usecprow) xb += 3; 1778 } 1779 break; 1780 case 4: 1781 for (i=0; i<mbs; i++) { 1782 if (usecprow) xb = x + 4*ridx[i]; 1783 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1784 ib = idx + ii[0]; 1785 n = ii[1] - ii[0]; ii++; 1786 for (j=0; j<n; j++) { 1787 rval = ib[j]*4; 1788 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1789 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1790 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1791 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1792 v += 16; 1793 } 1794 if (!usecprow) xb += 4; 1795 } 1796 break; 1797 case 5: 1798 for (i=0; i<mbs; i++) { 1799 if (usecprow) xb = x + 5*ridx[i]; 1800 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1801 x4 = xb[3]; x5 = xb[4]; 1802 ib = idx + ii[0]; 1803 n = ii[1] - ii[0]; ii++; 1804 for (j=0; j<n; j++) { 1805 rval = ib[j]*5; 1806 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1807 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1808 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1809 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1810 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1811 v += 25; 1812 } 1813 if (!usecprow) xb += 5; 1814 } 1815 break; 1816 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1817 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1818 #if 0 1819 { 1820 PetscInt ncols,k,bs2=a->bs2; 1821 PetscScalar *work,*workt,zb; 1822 const PetscScalar *xtmp; 1823 if (!a->mult_work) { 1824 k = PetscMax(A->rmap->n,A->cmap->n); 1825 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1826 } 1827 work = a->mult_work; 1828 xtmp = x; 1829 for (i=0; i<mbs; i++) { 1830 n = ii[1] - ii[0]; ii++; 1831 ncols = n*bs; 1832 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1833 if (usecprow) xtmp = x + bs*ridx[i]; 1834 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1835 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1836 v += n*bs2; 1837 if (!usecprow) xtmp += bs; 1838 workt = work; 1839 for (j=0; j<n; j++) { 1840 zb = z + bs*(*idx++); 1841 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1842 workt += bs; 1843 } 1844 } 1845 } 1846 #endif 1847 } 1848 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1849 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1850 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1851 PetscFunctionReturn(0); 1852 } 1853 1854 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1855 { 1856 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1857 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1858 const PetscScalar *x,*xb = 0; 1859 const MatScalar *v; 1860 PetscErrorCode ierr; 1861 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1862 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1863 Mat_CompressedRow cprow = a->compressedrow; 1864 PetscBool usecprow=cprow.use; 1865 1866 PetscFunctionBegin; 1867 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1868 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1869 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1870 1871 idx = a->j; 1872 v = a->a; 1873 if (usecprow) { 1874 mbs = cprow.nrows; 1875 ii = cprow.i; 1876 ridx = cprow.rindex; 1877 } else { 1878 mbs=a->mbs; 1879 ii = a->i; 1880 xb = x; 1881 } 1882 1883 switch (bs) { 1884 case 1: 1885 for (i=0; i<mbs; i++) { 1886 if (usecprow) xb = x + ridx[i]; 1887 x1 = xb[0]; 1888 ib = idx + ii[0]; 1889 n = ii[1] - ii[0]; ii++; 1890 for (j=0; j<n; j++) { 1891 rval = ib[j]; 1892 z[rval] += *v * x1; 1893 v++; 1894 } 1895 if (!usecprow) xb++; 1896 } 1897 break; 1898 case 2: 1899 for (i=0; i<mbs; i++) { 1900 if (usecprow) xb = x + 2*ridx[i]; 1901 x1 = xb[0]; x2 = xb[1]; 1902 ib = idx + ii[0]; 1903 n = ii[1] - ii[0]; ii++; 1904 for (j=0; j<n; j++) { 1905 rval = ib[j]*2; 1906 z[rval++] += v[0]*x1 + v[1]*x2; 1907 z[rval++] += v[2]*x1 + v[3]*x2; 1908 v += 4; 1909 } 1910 if (!usecprow) xb += 2; 1911 } 1912 break; 1913 case 3: 1914 for (i=0; i<mbs; i++) { 1915 if (usecprow) xb = x + 3*ridx[i]; 1916 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1917 ib = idx + ii[0]; 1918 n = ii[1] - ii[0]; ii++; 1919 for (j=0; j<n; j++) { 1920 rval = ib[j]*3; 1921 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1922 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1923 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1924 v += 9; 1925 } 1926 if (!usecprow) xb += 3; 1927 } 1928 break; 1929 case 4: 1930 for (i=0; i<mbs; i++) { 1931 if (usecprow) xb = x + 4*ridx[i]; 1932 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1933 ib = idx + ii[0]; 1934 n = ii[1] - ii[0]; ii++; 1935 for (j=0; j<n; j++) { 1936 rval = ib[j]*4; 1937 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1938 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1939 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1940 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1941 v += 16; 1942 } 1943 if (!usecprow) xb += 4; 1944 } 1945 break; 1946 case 5: 1947 for (i=0; i<mbs; i++) { 1948 if (usecprow) xb = x + 5*ridx[i]; 1949 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1950 x4 = xb[3]; x5 = xb[4]; 1951 ib = idx + ii[0]; 1952 n = ii[1] - ii[0]; ii++; 1953 for (j=0; j<n; j++) { 1954 rval = ib[j]*5; 1955 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1956 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1957 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1958 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1959 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1960 v += 25; 1961 } 1962 if (!usecprow) xb += 5; 1963 } 1964 break; 1965 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1966 PetscInt ncols,k; 1967 PetscScalar *work,*workt; 1968 const PetscScalar *xtmp; 1969 if (!a->mult_work) { 1970 k = PetscMax(A->rmap->n,A->cmap->n); 1971 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1972 } 1973 work = a->mult_work; 1974 xtmp = x; 1975 for (i=0; i<mbs; i++) { 1976 n = ii[1] - ii[0]; ii++; 1977 ncols = n*bs; 1978 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1979 if (usecprow) xtmp = x + bs*ridx[i]; 1980 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1981 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1982 v += n*bs2; 1983 if (!usecprow) xtmp += bs; 1984 workt = work; 1985 for (j=0; j<n; j++) { 1986 zb = z + bs*(*idx++); 1987 for (k=0; k<bs; k++) zb[k] += workt[k]; 1988 workt += bs; 1989 } 1990 } 1991 } 1992 } 1993 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1994 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1995 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1996 PetscFunctionReturn(0); 1997 } 1998 1999 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2000 { 2001 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2002 PetscInt totalnz = a->bs2*a->nz; 2003 PetscScalar oalpha = alpha; 2004 PetscErrorCode ierr; 2005 PetscBLASInt one = 1,tnz; 2006 2007 PetscFunctionBegin; 2008 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2009 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2010 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2011 PetscFunctionReturn(0); 2012 } 2013 2014 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2015 { 2016 PetscErrorCode ierr; 2017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2018 MatScalar *v = a->a; 2019 PetscReal sum = 0.0; 2020 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2021 2022 PetscFunctionBegin; 2023 if (type == NORM_FROBENIUS) { 2024 #if defined(PETSC_USE_REAL___FP16) 2025 PetscBLASInt one = 1,cnt = bs2*nz; 2026 *norm = BLASnrm2_(&cnt,v,&one); 2027 #else 2028 for (i=0; i<bs2*nz; i++) { 2029 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2030 } 2031 #endif 2032 *norm = PetscSqrtReal(sum); 2033 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2034 } else if (type == NORM_1) { /* maximum column sum */ 2035 PetscReal *tmp; 2036 PetscInt *bcol = a->j; 2037 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2038 for (i=0; i<nz; i++) { 2039 for (j=0; j<bs; j++) { 2040 k1 = bs*(*bcol) + j; /* column index */ 2041 for (k=0; k<bs; k++) { 2042 tmp[k1] += PetscAbsScalar(*v); v++; 2043 } 2044 } 2045 bcol++; 2046 } 2047 *norm = 0.0; 2048 for (j=0; j<A->cmap->n; j++) { 2049 if (tmp[j] > *norm) *norm = tmp[j]; 2050 } 2051 ierr = PetscFree(tmp);CHKERRQ(ierr); 2052 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2053 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2054 *norm = 0.0; 2055 for (k=0; k<bs; k++) { 2056 for (j=0; j<a->mbs; j++) { 2057 v = a->a + bs2*a->i[j] + k; 2058 sum = 0.0; 2059 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2060 for (k1=0; k1<bs; k1++) { 2061 sum += PetscAbsScalar(*v); 2062 v += bs; 2063 } 2064 } 2065 if (sum > *norm) *norm = sum; 2066 } 2067 } 2068 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2069 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2070 PetscFunctionReturn(0); 2071 } 2072 2073 2074 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2075 { 2076 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2077 PetscErrorCode ierr; 2078 2079 PetscFunctionBegin; 2080 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2081 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2082 *flg = PETSC_FALSE; 2083 PetscFunctionReturn(0); 2084 } 2085 2086 /* if the a->i are the same */ 2087 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2088 if (!*flg) PetscFunctionReturn(0); 2089 2090 /* if a->j are the same */ 2091 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2092 if (!*flg) PetscFunctionReturn(0); 2093 2094 /* if a->a are the same */ 2095 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2096 PetscFunctionReturn(0); 2097 2098 } 2099 2100 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2101 { 2102 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2103 PetscErrorCode ierr; 2104 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2105 PetscScalar *x,zero = 0.0; 2106 MatScalar *aa,*aa_j; 2107 2108 PetscFunctionBegin; 2109 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2110 bs = A->rmap->bs; 2111 aa = a->a; 2112 ai = a->i; 2113 aj = a->j; 2114 ambs = a->mbs; 2115 bs2 = a->bs2; 2116 2117 ierr = VecSet(v,zero);CHKERRQ(ierr); 2118 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2119 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2120 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2121 for (i=0; i<ambs; i++) { 2122 for (j=ai[i]; j<ai[i+1]; j++) { 2123 if (aj[j] == i) { 2124 row = i*bs; 2125 aa_j = aa+j*bs2; 2126 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2127 break; 2128 } 2129 } 2130 } 2131 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2132 PetscFunctionReturn(0); 2133 } 2134 2135 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2136 { 2137 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2138 const PetscScalar *l,*r,*li,*ri; 2139 PetscScalar x; 2140 MatScalar *aa, *v; 2141 PetscErrorCode ierr; 2142 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2143 const PetscInt *ai,*aj; 2144 2145 PetscFunctionBegin; 2146 ai = a->i; 2147 aj = a->j; 2148 aa = a->a; 2149 m = A->rmap->n; 2150 n = A->cmap->n; 2151 bs = A->rmap->bs; 2152 mbs = a->mbs; 2153 bs2 = a->bs2; 2154 if (ll) { 2155 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2156 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2157 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2158 for (i=0; i<mbs; i++) { /* for each block row */ 2159 M = ai[i+1] - ai[i]; 2160 li = l + i*bs; 2161 v = aa + bs2*ai[i]; 2162 for (j=0; j<M; j++) { /* for each block */ 2163 for (k=0; k<bs2; k++) { 2164 (*v++) *= li[k%bs]; 2165 } 2166 } 2167 } 2168 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2169 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2170 } 2171 2172 if (rr) { 2173 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2174 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2175 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2176 for (i=0; i<mbs; i++) { /* for each block row */ 2177 iai = ai[i]; 2178 M = ai[i+1] - iai; 2179 v = aa + bs2*iai; 2180 for (j=0; j<M; j++) { /* for each block */ 2181 ri = r + bs*aj[iai+j]; 2182 for (k=0; k<bs; k++) { 2183 x = ri[k]; 2184 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2185 v += bs; 2186 } 2187 } 2188 } 2189 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2190 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2191 } 2192 PetscFunctionReturn(0); 2193 } 2194 2195 2196 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2197 { 2198 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2199 2200 PetscFunctionBegin; 2201 info->block_size = a->bs2; 2202 info->nz_allocated = a->bs2*a->maxnz; 2203 info->nz_used = a->bs2*a->nz; 2204 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2205 info->assemblies = A->num_ass; 2206 info->mallocs = A->info.mallocs; 2207 info->memory = ((PetscObject)A)->mem; 2208 if (A->factortype) { 2209 info->fill_ratio_given = A->info.fill_ratio_given; 2210 info->fill_ratio_needed = A->info.fill_ratio_needed; 2211 info->factor_mallocs = A->info.factor_mallocs; 2212 } else { 2213 info->fill_ratio_given = 0; 2214 info->fill_ratio_needed = 0; 2215 info->factor_mallocs = 0; 2216 } 2217 PetscFunctionReturn(0); 2218 } 2219 2220 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2221 { 2222 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2223 PetscErrorCode ierr; 2224 2225 PetscFunctionBegin; 2226 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2227 PetscFunctionReturn(0); 2228 } 2229