1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc/private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 8 { 9 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 10 PetscErrorCode ierr; 11 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 12 const PetscInt *idx; 13 PetscInt start,end,*ai,*aj,bs,*nidx2; 14 PetscBT table; 15 16 PetscFunctionBegin; 17 m = a->mbs; 18 ai = a->i; 19 aj = a->j; 20 bs = A->rmap->bs; 21 22 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 23 24 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 25 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 26 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 27 28 for (i=0; i<is_max; i++) { 29 /* Initialise the two local arrays */ 30 isz = 0; 31 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 32 33 /* Extract the indices, assume there can be duplicate entries */ 34 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 35 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 36 37 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 38 for (j=0; j<n; ++j) { 39 ival = idx[j]/bs; /* convert the indices into block indices */ 40 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 41 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 42 } 43 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 44 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 45 46 k = 0; 47 for (j=0; j<ov; j++) { /* for each overlap*/ 48 n = isz; 49 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 50 row = nidx[k]; 51 start = ai[row]; 52 end = ai[row+1]; 53 for (l = start; l<end; l++) { 54 val = aj[l]; 55 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 56 } 57 } 58 } 59 /* expand the Index Set */ 60 for (j=0; j<isz; j++) { 61 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 62 } 63 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 64 } 65 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 66 ierr = PetscFree(nidx);CHKERRQ(ierr); 67 ierr = PetscFree(nidx2);CHKERRQ(ierr); 68 PetscFunctionReturn(0); 69 } 70 71 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 72 { 73 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 74 PetscErrorCode ierr; 75 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 76 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 77 const PetscInt *irow,*icol; 78 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 79 PetscInt *aj = a->j,*ai = a->i; 80 MatScalar *mat_a; 81 Mat C; 82 PetscBool flag; 83 84 PetscFunctionBegin; 85 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 86 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 88 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 89 90 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 91 ssmap = smap; 92 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 93 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 94 /* determine lens of each row */ 95 for (i=0; i<nrows; i++) { 96 kstart = ai[irow[i]]; 97 kend = kstart + a->ilen[irow[i]]; 98 lens[i] = 0; 99 for (k=kstart; k<kend; k++) { 100 if (ssmap[aj[k]]) lens[i]++; 101 } 102 } 103 /* Create and fill new matrix */ 104 if (scall == MAT_REUSE_MATRIX) { 105 c = (Mat_SeqBAIJ*)((*B)->data); 106 107 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 108 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 109 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 110 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 111 C = *B; 112 } else { 113 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 114 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 115 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 116 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr); 117 } 118 c = (Mat_SeqBAIJ*)(C->data); 119 for (i=0; i<nrows; i++) { 120 row = irow[i]; 121 kstart = ai[row]; 122 kend = kstart + a->ilen[row]; 123 mat_i = c->i[i]; 124 mat_j = c->j + mat_i; 125 mat_a = c->a + mat_i*bs2; 126 mat_ilen = c->ilen + i; 127 for (k=kstart; k<kend; k++) { 128 if ((tcol=ssmap[a->j[k]])) { 129 *mat_j++ = tcol - 1; 130 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 131 mat_a += bs2; 132 (*mat_ilen)++; 133 } 134 } 135 } 136 /* sort */ 137 { 138 MatScalar *work; 139 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 140 for (i=0; i<nrows; i++) { 141 PetscInt ilen; 142 mat_i = c->i[i]; 143 mat_j = c->j + mat_i; 144 mat_a = c->a + mat_i*bs2; 145 ilen = c->ilen[i]; 146 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 147 } 148 ierr = PetscFree(work);CHKERRQ(ierr); 149 } 150 151 /* Free work space */ 152 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 153 ierr = PetscFree(smap);CHKERRQ(ierr); 154 ierr = PetscFree(lens);CHKERRQ(ierr); 155 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 156 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 157 158 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 159 *B = C; 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 164 { 165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 166 IS is1,is2; 167 PetscErrorCode ierr; 168 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j; 169 const PetscInt *irow,*icol; 170 171 PetscFunctionBegin; 172 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 173 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 175 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 176 177 /* Verify if the indices corespond to each element in a block 178 and form the IS with compressed IS */ 179 maxmnbs = PetscMax(a->mbs,a->nbs); 180 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr); 181 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 182 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 183 for (i=0; i<a->mbs; i++) { 184 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 185 } 186 count = 0; 187 for (i=0; i<nrows; i++) { 188 j = irow[i] / bs; 189 if ((vary[j]--)==bs) iary[count++] = j; 190 } 191 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 192 193 ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr); 194 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 195 for (i=0; i<a->nbs; i++) { 196 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 197 } 198 count = 0; 199 for (i=0; i<ncols; i++) { 200 j = icol[i] / bs; 201 if ((vary[j]--)==bs) iary[count++] = j; 202 } 203 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 205 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 206 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 207 208 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 209 ierr = ISDestroy(&is1);CHKERRQ(ierr); 210 ierr = ISDestroy(&is2);CHKERRQ(ierr); 211 PetscFunctionReturn(0); 212 } 213 214 PetscErrorCode MatDestroy_SeqBAIJ_Submatrices(Mat C) 215 { 216 PetscErrorCode ierr; 217 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data; 218 Mat_SubSppt *submatj = c->submatis1; 219 220 PetscFunctionBegin; 221 ierr = submatj->destroy(C);CHKERRQ(ierr); 222 ierr = MatDestroySubMatrices_Private(submatj);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 227 { 228 PetscErrorCode ierr; 229 PetscInt i; 230 231 PetscFunctionBegin; 232 if (scall == MAT_INITIAL_MATRIX) { 233 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr); 234 } 235 236 for (i=0; i<n; i++) { 237 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 238 } 239 PetscFunctionReturn(0); 240 } 241 242 243 /* -------------------------------------------------------*/ 244 /* Should check that shapes of vectors and matrices match */ 245 /* -------------------------------------------------------*/ 246 247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 248 { 249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 250 PetscScalar *z,sum; 251 const PetscScalar *x; 252 const MatScalar *v; 253 PetscErrorCode ierr; 254 PetscInt mbs,i,n; 255 const PetscInt *idx,*ii,*ridx=NULL; 256 PetscBool usecprow=a->compressedrow.use; 257 258 PetscFunctionBegin; 259 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 260 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 261 262 if (usecprow) { 263 mbs = a->compressedrow.nrows; 264 ii = a->compressedrow.i; 265 ridx = a->compressedrow.rindex; 266 ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 267 } else { 268 mbs = a->mbs; 269 ii = a->i; 270 } 271 272 for (i=0; i<mbs; i++) { 273 n = ii[1] - ii[0]; 274 v = a->a + ii[0]; 275 idx = a->j + ii[0]; 276 ii++; 277 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 278 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 279 sum = 0.0; 280 PetscSparseDensePlusDot(sum,x,v,idx,n); 281 if (usecprow) { 282 z[ridx[i]] = sum; 283 } else { 284 z[i] = sum; 285 } 286 } 287 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 288 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 289 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 290 PetscFunctionReturn(0); 291 } 292 293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 294 { 295 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 296 PetscScalar *z = 0,sum1,sum2,*zarray; 297 const PetscScalar *x,*xb; 298 PetscScalar x1,x2; 299 const MatScalar *v; 300 PetscErrorCode ierr; 301 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 302 PetscBool usecprow=a->compressedrow.use; 303 304 PetscFunctionBegin; 305 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 306 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 307 308 idx = a->j; 309 v = a->a; 310 if (usecprow) { 311 mbs = a->compressedrow.nrows; 312 ii = a->compressedrow.i; 313 ridx = a->compressedrow.rindex; 314 ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 315 } else { 316 mbs = a->mbs; 317 ii = a->i; 318 z = zarray; 319 } 320 321 for (i=0; i<mbs; i++) { 322 n = ii[1] - ii[0]; ii++; 323 sum1 = 0.0; sum2 = 0.0; 324 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 325 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 326 for (j=0; j<n; j++) { 327 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 328 sum1 += v[0]*x1 + v[2]*x2; 329 sum2 += v[1]*x1 + v[3]*x2; 330 v += 4; 331 } 332 if (usecprow) z = zarray + 2*ridx[i]; 333 z[0] = sum1; z[1] = sum2; 334 if (!usecprow) z += 2; 335 } 336 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 337 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 338 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 339 PetscFunctionReturn(0); 340 } 341 342 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 343 { 344 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 345 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 346 const PetscScalar *x,*xb; 347 const MatScalar *v; 348 PetscErrorCode ierr; 349 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 350 PetscBool usecprow=a->compressedrow.use; 351 352 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 353 #pragma disjoint(*v,*z,*xb) 354 #endif 355 356 PetscFunctionBegin; 357 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 358 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 359 360 idx = a->j; 361 v = a->a; 362 if (usecprow) { 363 mbs = a->compressedrow.nrows; 364 ii = a->compressedrow.i; 365 ridx = a->compressedrow.rindex; 366 ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 367 } else { 368 mbs = a->mbs; 369 ii = a->i; 370 z = zarray; 371 } 372 373 for (i=0; i<mbs; i++) { 374 n = ii[1] - ii[0]; ii++; 375 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 376 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378 for (j=0; j<n; j++) { 379 xb = x + 3*(*idx++); 380 x1 = xb[0]; 381 x2 = xb[1]; 382 x3 = xb[2]; 383 384 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 385 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 386 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 387 v += 9; 388 } 389 if (usecprow) z = zarray + 3*ridx[i]; 390 z[0] = sum1; z[1] = sum2; z[2] = sum3; 391 if (!usecprow) z += 3; 392 } 393 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 394 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 395 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 396 PetscFunctionReturn(0); 397 } 398 399 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 400 { 401 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 402 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 403 const PetscScalar *x,*xb; 404 const MatScalar *v; 405 PetscErrorCode ierr; 406 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 407 PetscBool usecprow=a->compressedrow.use; 408 409 PetscFunctionBegin; 410 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 411 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 412 413 idx = a->j; 414 v = a->a; 415 if (usecprow) { 416 mbs = a->compressedrow.nrows; 417 ii = a->compressedrow.i; 418 ridx = a->compressedrow.rindex; 419 ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 420 } else { 421 mbs = a->mbs; 422 ii = a->i; 423 z = zarray; 424 } 425 426 for (i=0; i<mbs; i++) { 427 n = ii[1] - ii[0]; 428 ii++; 429 sum1 = 0.0; 430 sum2 = 0.0; 431 sum3 = 0.0; 432 sum4 = 0.0; 433 434 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 435 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 436 for (j=0; j<n; j++) { 437 xb = x + 4*(*idx++); 438 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 439 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 440 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 441 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 442 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 443 v += 16; 444 } 445 if (usecprow) z = zarray + 4*ridx[i]; 446 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 447 if (!usecprow) z += 4; 448 } 449 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 450 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 451 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 452 PetscFunctionReturn(0); 453 } 454 455 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 456 { 457 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 458 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 459 const PetscScalar *xb,*x; 460 const MatScalar *v; 461 PetscErrorCode ierr; 462 const PetscInt *idx,*ii,*ridx=NULL; 463 PetscInt mbs,i,j,n; 464 PetscBool usecprow=a->compressedrow.use; 465 466 PetscFunctionBegin; 467 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 468 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 469 470 idx = a->j; 471 v = a->a; 472 if (usecprow) { 473 mbs = a->compressedrow.nrows; 474 ii = a->compressedrow.i; 475 ridx = a->compressedrow.rindex; 476 ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 477 } else { 478 mbs = a->mbs; 479 ii = a->i; 480 z = zarray; 481 } 482 483 for (i=0; i<mbs; i++) { 484 n = ii[1] - ii[0]; ii++; 485 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 486 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 487 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 488 for (j=0; j<n; j++) { 489 xb = x + 5*(*idx++); 490 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 491 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 492 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 493 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 494 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 495 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 496 v += 25; 497 } 498 if (usecprow) z = zarray + 5*ridx[i]; 499 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 500 if (!usecprow) z += 5; 501 } 502 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 503 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 504 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 505 PetscFunctionReturn(0); 506 } 507 508 509 510 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 511 { 512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 513 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 514 const PetscScalar *x,*xb; 515 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 516 const MatScalar *v; 517 PetscErrorCode ierr; 518 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 519 PetscBool usecprow=a->compressedrow.use; 520 521 PetscFunctionBegin; 522 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 523 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 524 525 idx = a->j; 526 v = a->a; 527 if (usecprow) { 528 mbs = a->compressedrow.nrows; 529 ii = a->compressedrow.i; 530 ridx = a->compressedrow.rindex; 531 ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 532 } else { 533 mbs = a->mbs; 534 ii = a->i; 535 z = zarray; 536 } 537 538 for (i=0; i<mbs; i++) { 539 n = ii[1] - ii[0]; 540 ii++; 541 sum1 = 0.0; 542 sum2 = 0.0; 543 sum3 = 0.0; 544 sum4 = 0.0; 545 sum5 = 0.0; 546 sum6 = 0.0; 547 548 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 549 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 550 for (j=0; j<n; j++) { 551 xb = x + 6*(*idx++); 552 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 553 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 554 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 555 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 556 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 557 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 558 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 559 v += 36; 560 } 561 if (usecprow) z = zarray + 6*ridx[i]; 562 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 563 if (!usecprow) z += 6; 564 } 565 566 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 567 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 568 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 569 PetscFunctionReturn(0); 570 } 571 572 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 573 { 574 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 575 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 576 const PetscScalar *x,*xb; 577 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 578 const MatScalar *v; 579 PetscErrorCode ierr; 580 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 581 PetscBool usecprow=a->compressedrow.use; 582 583 PetscFunctionBegin; 584 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 585 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 586 587 idx = a->j; 588 v = a->a; 589 if (usecprow) { 590 mbs = a->compressedrow.nrows; 591 ii = a->compressedrow.i; 592 ridx = a->compressedrow.rindex; 593 ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 594 } else { 595 mbs = a->mbs; 596 ii = a->i; 597 z = zarray; 598 } 599 600 for (i=0; i<mbs; i++) { 601 n = ii[1] - ii[0]; 602 ii++; 603 sum1 = 0.0; 604 sum2 = 0.0; 605 sum3 = 0.0; 606 sum4 = 0.0; 607 sum5 = 0.0; 608 sum6 = 0.0; 609 sum7 = 0.0; 610 611 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 612 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 613 for (j=0; j<n; j++) { 614 xb = x + 7*(*idx++); 615 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 616 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 617 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 618 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 619 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 620 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 621 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 622 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 623 v += 49; 624 } 625 if (usecprow) z = zarray + 7*ridx[i]; 626 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 627 if (!usecprow) z += 7; 628 } 629 630 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 631 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 632 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 633 PetscFunctionReturn(0); 634 } 635 636 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz) 637 { 638 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 639 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 640 const PetscScalar *x,*xb; 641 PetscScalar *zarray,xv; 642 const MatScalar *v; 643 PetscErrorCode ierr; 644 const PetscInt *ii,*ij=a->j,*idx; 645 PetscInt mbs,i,j,k,n,*ridx=NULL; 646 PetscBool usecprow=a->compressedrow.use; 647 648 PetscFunctionBegin; 649 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 650 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 651 652 v = a->a; 653 if (usecprow) { 654 mbs = a->compressedrow.nrows; 655 ii = a->compressedrow.i; 656 ridx = a->compressedrow.rindex; 657 ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 658 } else { 659 mbs = a->mbs; 660 ii = a->i; 661 z = zarray; 662 } 663 664 for (i=0; i<mbs; i++) { 665 n = ii[i+1] - ii[i]; 666 idx = ij + ii[i]; 667 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 668 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; 669 670 for (j=0; j<n; j++) { 671 xb = x + 11*(idx[j]); 672 673 for (k=0; k<11; k++) { 674 xv = xb[k]; 675 sum1 += v[0]*xv; 676 sum2 += v[1]*xv; 677 sum3 += v[2]*xv; 678 sum4 += v[3]*xv; 679 sum5 += v[4]*xv; 680 sum6 += v[5]*xv; 681 sum7 += v[6]*xv; 682 sum8 += v[7]*xv; 683 sum9 += v[8]*xv; 684 sum10 += v[9]*xv; 685 sum11 += v[10]*xv; 686 v += 11; 687 } 688 } 689 if (usecprow) z = zarray + 11*ridx[i]; 690 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 691 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; 692 693 if (!usecprow) z += 11; 694 } 695 696 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 697 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 698 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 703 /* Default MatMult for block size 15 */ 704 705 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 706 { 707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 708 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 709 const PetscScalar *x,*xb; 710 PetscScalar *zarray,xv; 711 const MatScalar *v; 712 PetscErrorCode ierr; 713 const PetscInt *ii,*ij=a->j,*idx; 714 PetscInt mbs,i,j,k,n,*ridx=NULL; 715 PetscBool usecprow=a->compressedrow.use; 716 717 PetscFunctionBegin; 718 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 719 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 720 721 v = a->a; 722 if (usecprow) { 723 mbs = a->compressedrow.nrows; 724 ii = a->compressedrow.i; 725 ridx = a->compressedrow.rindex; 726 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 727 } else { 728 mbs = a->mbs; 729 ii = a->i; 730 z = zarray; 731 } 732 733 for (i=0; i<mbs; i++) { 734 n = ii[i+1] - ii[i]; 735 idx = ij + ii[i]; 736 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 737 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 738 739 for (j=0; j<n; j++) { 740 xb = x + 15*(idx[j]); 741 742 for (k=0; k<15; k++) { 743 xv = xb[k]; 744 sum1 += v[0]*xv; 745 sum2 += v[1]*xv; 746 sum3 += v[2]*xv; 747 sum4 += v[3]*xv; 748 sum5 += v[4]*xv; 749 sum6 += v[5]*xv; 750 sum7 += v[6]*xv; 751 sum8 += v[7]*xv; 752 sum9 += v[8]*xv; 753 sum10 += v[9]*xv; 754 sum11 += v[10]*xv; 755 sum12 += v[11]*xv; 756 sum13 += v[12]*xv; 757 sum14 += v[13]*xv; 758 sum15 += v[14]*xv; 759 v += 15; 760 } 761 } 762 if (usecprow) z = zarray + 15*ridx[i]; 763 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 764 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 765 766 if (!usecprow) z += 15; 767 } 768 769 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 771 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 776 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 777 { 778 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 779 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 780 const PetscScalar *x,*xb; 781 PetscScalar x1,x2,x3,x4,*zarray; 782 const MatScalar *v; 783 PetscErrorCode ierr; 784 const PetscInt *ii,*ij=a->j,*idx; 785 PetscInt mbs,i,j,n,*ridx=NULL; 786 PetscBool usecprow=a->compressedrow.use; 787 788 PetscFunctionBegin; 789 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 790 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 791 792 v = a->a; 793 if (usecprow) { 794 mbs = a->compressedrow.nrows; 795 ii = a->compressedrow.i; 796 ridx = a->compressedrow.rindex; 797 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 798 } else { 799 mbs = a->mbs; 800 ii = a->i; 801 z = zarray; 802 } 803 804 for (i=0; i<mbs; i++) { 805 n = ii[i+1] - ii[i]; 806 idx = ij + ii[i]; 807 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 808 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 809 810 for (j=0; j<n; j++) { 811 xb = x + 15*(idx[j]); 812 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 813 814 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 815 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 816 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 817 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 818 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 819 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 820 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 821 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 822 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 823 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 824 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 825 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 826 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 827 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 828 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 829 830 v += 60; 831 832 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 833 834 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 835 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 836 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 837 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 838 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 839 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 840 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 841 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 842 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 843 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 844 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 845 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 846 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 847 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 848 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 849 v += 60; 850 851 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 852 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 853 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 854 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 855 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 856 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 857 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 858 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 859 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 860 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 861 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 862 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 863 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 864 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 865 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 866 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 867 v += 60; 868 869 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 870 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 871 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 872 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 873 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 874 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 875 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 876 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 877 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 878 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 879 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 880 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 881 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 882 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 883 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 884 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 885 v += 45; 886 } 887 if (usecprow) z = zarray + 15*ridx[i]; 888 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 889 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 890 891 if (!usecprow) z += 15; 892 } 893 894 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 895 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 896 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 897 PetscFunctionReturn(0); 898 } 899 900 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 901 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 902 { 903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 904 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 905 const PetscScalar *x,*xb; 906 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 907 const MatScalar *v; 908 PetscErrorCode ierr; 909 const PetscInt *ii,*ij=a->j,*idx; 910 PetscInt mbs,i,j,n,*ridx=NULL; 911 PetscBool usecprow=a->compressedrow.use; 912 913 PetscFunctionBegin; 914 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 915 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 916 917 v = a->a; 918 if (usecprow) { 919 mbs = a->compressedrow.nrows; 920 ii = a->compressedrow.i; 921 ridx = a->compressedrow.rindex; 922 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 923 } else { 924 mbs = a->mbs; 925 ii = a->i; 926 z = zarray; 927 } 928 929 for (i=0; i<mbs; i++) { 930 n = ii[i+1] - ii[i]; 931 idx = ij + ii[i]; 932 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 933 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 934 935 for (j=0; j<n; j++) { 936 xb = x + 15*(idx[j]); 937 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 938 x8 = xb[7]; 939 940 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 941 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 942 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 943 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 944 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 945 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 946 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 947 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 948 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 949 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 950 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 951 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 952 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 953 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 954 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 955 v += 120; 956 957 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 958 959 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 960 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 961 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 962 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 963 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 964 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 965 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 966 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 967 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 968 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 969 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 970 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 971 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 972 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 973 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 974 v += 105; 975 } 976 if (usecprow) z = zarray + 15*ridx[i]; 977 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 978 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 979 980 if (!usecprow) z += 15; 981 } 982 983 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 984 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 985 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 986 PetscFunctionReturn(0); 987 } 988 989 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 990 991 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 992 { 993 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 994 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 995 const PetscScalar *x,*xb; 996 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 997 const MatScalar *v; 998 PetscErrorCode ierr; 999 const PetscInt *ii,*ij=a->j,*idx; 1000 PetscInt mbs,i,j,n,*ridx=NULL; 1001 PetscBool usecprow=a->compressedrow.use; 1002 1003 PetscFunctionBegin; 1004 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1005 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1006 1007 v = a->a; 1008 if (usecprow) { 1009 mbs = a->compressedrow.nrows; 1010 ii = a->compressedrow.i; 1011 ridx = a->compressedrow.rindex; 1012 ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1013 } else { 1014 mbs = a->mbs; 1015 ii = a->i; 1016 z = zarray; 1017 } 1018 1019 for (i=0; i<mbs; i++) { 1020 n = ii[i+1] - ii[i]; 1021 idx = ij + ii[i]; 1022 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1023 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1024 1025 for (j=0; j<n; j++) { 1026 xb = x + 15*(idx[j]); 1027 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1028 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1029 1030 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1031 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1032 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1033 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1034 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1035 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1036 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1037 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1038 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1039 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1040 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1041 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1042 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1043 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1044 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1045 v += 225; 1046 } 1047 if (usecprow) z = zarray + 15*ridx[i]; 1048 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1049 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1050 1051 if (!usecprow) z += 15; 1052 } 1053 1054 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1055 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1056 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1057 PetscFunctionReturn(0); 1058 } 1059 1060 1061 /* 1062 This will not work with MatScalar == float because it calls the BLAS 1063 */ 1064 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1065 { 1066 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1067 PetscScalar *z = 0,*work,*workt,*zarray; 1068 const PetscScalar *x,*xb; 1069 const MatScalar *v; 1070 PetscErrorCode ierr; 1071 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1072 const PetscInt *idx,*ii,*ridx=NULL; 1073 PetscInt ncols,k; 1074 PetscBool usecprow=a->compressedrow.use; 1075 1076 PetscFunctionBegin; 1077 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1078 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1079 1080 idx = a->j; 1081 v = a->a; 1082 if (usecprow) { 1083 mbs = a->compressedrow.nrows; 1084 ii = a->compressedrow.i; 1085 ridx = a->compressedrow.rindex; 1086 ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1087 } else { 1088 mbs = a->mbs; 1089 ii = a->i; 1090 z = zarray; 1091 } 1092 1093 if (!a->mult_work) { 1094 k = PetscMax(A->rmap->n,A->cmap->n); 1095 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1096 } 1097 work = a->mult_work; 1098 for (i=0; i<mbs; i++) { 1099 n = ii[1] - ii[0]; ii++; 1100 ncols = n*bs; 1101 workt = work; 1102 for (j=0; j<n; j++) { 1103 xb = x + bs*(*idx++); 1104 for (k=0; k<bs; k++) workt[k] = xb[k]; 1105 workt += bs; 1106 } 1107 if (usecprow) z = zarray + bs*ridx[i]; 1108 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1109 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1110 v += n*bs2; 1111 if (!usecprow) z += bs; 1112 } 1113 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1114 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1115 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1116 PetscFunctionReturn(0); 1117 } 1118 1119 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1120 { 1121 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1122 const PetscScalar *x; 1123 PetscScalar *y,*z,sum; 1124 const MatScalar *v; 1125 PetscErrorCode ierr; 1126 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1127 const PetscInt *idx,*ii; 1128 PetscBool usecprow=a->compressedrow.use; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1132 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1133 1134 idx = a->j; 1135 v = a->a; 1136 if (usecprow) { 1137 if (zz != yy) { 1138 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1139 } 1140 mbs = a->compressedrow.nrows; 1141 ii = a->compressedrow.i; 1142 ridx = a->compressedrow.rindex; 1143 } else { 1144 ii = a->i; 1145 } 1146 1147 for (i=0; i<mbs; i++) { 1148 n = ii[1] - ii[0]; 1149 ii++; 1150 if (!usecprow) { 1151 sum = y[i]; 1152 } else { 1153 sum = y[ridx[i]]; 1154 } 1155 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1156 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1157 PetscSparseDensePlusDot(sum,x,v,idx,n); 1158 v += n; 1159 idx += n; 1160 if (usecprow) { 1161 z[ridx[i]] = sum; 1162 } else { 1163 z[i] = sum; 1164 } 1165 } 1166 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1167 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1173 { 1174 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1175 PetscScalar *y = 0,*z = 0,sum1,sum2; 1176 const PetscScalar *x,*xb; 1177 PetscScalar x1,x2,*yarray,*zarray; 1178 const MatScalar *v; 1179 PetscErrorCode ierr; 1180 PetscInt mbs = a->mbs,i,n,j; 1181 const PetscInt *idx,*ii,*ridx = NULL; 1182 PetscBool usecprow = a->compressedrow.use; 1183 1184 PetscFunctionBegin; 1185 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1186 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1187 1188 idx = a->j; 1189 v = a->a; 1190 if (usecprow) { 1191 if (zz != yy) { 1192 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1193 } 1194 mbs = a->compressedrow.nrows; 1195 ii = a->compressedrow.i; 1196 ridx = a->compressedrow.rindex; 1197 } else { 1198 ii = a->i; 1199 y = yarray; 1200 z = zarray; 1201 } 1202 1203 for (i=0; i<mbs; i++) { 1204 n = ii[1] - ii[0]; ii++; 1205 if (usecprow) { 1206 z = zarray + 2*ridx[i]; 1207 y = yarray + 2*ridx[i]; 1208 } 1209 sum1 = y[0]; sum2 = y[1]; 1210 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1211 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1212 for (j=0; j<n; j++) { 1213 xb = x + 2*(*idx++); 1214 x1 = xb[0]; 1215 x2 = xb[1]; 1216 1217 sum1 += v[0]*x1 + v[2]*x2; 1218 sum2 += v[1]*x1 + v[3]*x2; 1219 v += 4; 1220 } 1221 z[0] = sum1; z[1] = sum2; 1222 if (!usecprow) { 1223 z += 2; y += 2; 1224 } 1225 } 1226 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1227 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1228 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1233 { 1234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1235 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1236 const PetscScalar *x,*xb; 1237 const MatScalar *v; 1238 PetscErrorCode ierr; 1239 PetscInt mbs = a->mbs,i,j,n; 1240 const PetscInt *idx,*ii,*ridx = NULL; 1241 PetscBool usecprow = a->compressedrow.use; 1242 1243 PetscFunctionBegin; 1244 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1245 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1246 1247 idx = a->j; 1248 v = a->a; 1249 if (usecprow) { 1250 if (zz != yy) { 1251 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1252 } 1253 mbs = a->compressedrow.nrows; 1254 ii = a->compressedrow.i; 1255 ridx = a->compressedrow.rindex; 1256 } else { 1257 ii = a->i; 1258 y = yarray; 1259 z = zarray; 1260 } 1261 1262 for (i=0; i<mbs; i++) { 1263 n = ii[1] - ii[0]; ii++; 1264 if (usecprow) { 1265 z = zarray + 3*ridx[i]; 1266 y = yarray + 3*ridx[i]; 1267 } 1268 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1269 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1270 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1271 for (j=0; j<n; j++) { 1272 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1273 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1274 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1275 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1276 v += 9; 1277 } 1278 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1279 if (!usecprow) { 1280 z += 3; y += 3; 1281 } 1282 } 1283 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1284 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1285 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1286 PetscFunctionReturn(0); 1287 } 1288 1289 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1290 { 1291 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1292 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1293 const PetscScalar *x,*xb; 1294 const MatScalar *v; 1295 PetscErrorCode ierr; 1296 PetscInt mbs = a->mbs,i,j,n; 1297 const PetscInt *idx,*ii,*ridx=NULL; 1298 PetscBool usecprow=a->compressedrow.use; 1299 1300 PetscFunctionBegin; 1301 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1302 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1303 1304 idx = a->j; 1305 v = a->a; 1306 if (usecprow) { 1307 if (zz != yy) { 1308 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1309 } 1310 mbs = a->compressedrow.nrows; 1311 ii = a->compressedrow.i; 1312 ridx = a->compressedrow.rindex; 1313 } else { 1314 ii = a->i; 1315 y = yarray; 1316 z = zarray; 1317 } 1318 1319 for (i=0; i<mbs; i++) { 1320 n = ii[1] - ii[0]; ii++; 1321 if (usecprow) { 1322 z = zarray + 4*ridx[i]; 1323 y = yarray + 4*ridx[i]; 1324 } 1325 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1326 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1327 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1328 for (j=0; j<n; j++) { 1329 xb = x + 4*(*idx++); 1330 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1331 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1332 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1333 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1334 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1335 v += 16; 1336 } 1337 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1338 if (!usecprow) { 1339 z += 4; y += 4; 1340 } 1341 } 1342 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1343 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1344 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1349 { 1350 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1351 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1352 const PetscScalar *x,*xb; 1353 PetscScalar *yarray,*zarray; 1354 const MatScalar *v; 1355 PetscErrorCode ierr; 1356 PetscInt mbs = a->mbs,i,j,n; 1357 const PetscInt *idx,*ii,*ridx = NULL; 1358 PetscBool usecprow=a->compressedrow.use; 1359 1360 PetscFunctionBegin; 1361 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1362 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1363 1364 idx = a->j; 1365 v = a->a; 1366 if (usecprow) { 1367 if (zz != yy) { 1368 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1369 } 1370 mbs = a->compressedrow.nrows; 1371 ii = a->compressedrow.i; 1372 ridx = a->compressedrow.rindex; 1373 } else { 1374 ii = a->i; 1375 y = yarray; 1376 z = zarray; 1377 } 1378 1379 for (i=0; i<mbs; i++) { 1380 n = ii[1] - ii[0]; ii++; 1381 if (usecprow) { 1382 z = zarray + 5*ridx[i]; 1383 y = yarray + 5*ridx[i]; 1384 } 1385 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1386 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1387 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1388 for (j=0; j<n; j++) { 1389 xb = x + 5*(*idx++); 1390 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1391 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1392 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1393 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1394 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1395 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1396 v += 25; 1397 } 1398 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1399 if (!usecprow) { 1400 z += 5; y += 5; 1401 } 1402 } 1403 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1404 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1405 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1406 PetscFunctionReturn(0); 1407 } 1408 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1409 { 1410 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1411 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1412 const PetscScalar *x,*xb; 1413 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1414 const MatScalar *v; 1415 PetscErrorCode ierr; 1416 PetscInt mbs = a->mbs,i,j,n; 1417 const PetscInt *idx,*ii,*ridx=NULL; 1418 PetscBool usecprow=a->compressedrow.use; 1419 1420 PetscFunctionBegin; 1421 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1422 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1423 1424 idx = a->j; 1425 v = a->a; 1426 if (usecprow) { 1427 if (zz != yy) { 1428 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1429 } 1430 mbs = a->compressedrow.nrows; 1431 ii = a->compressedrow.i; 1432 ridx = a->compressedrow.rindex; 1433 } else { 1434 ii = a->i; 1435 y = yarray; 1436 z = zarray; 1437 } 1438 1439 for (i=0; i<mbs; i++) { 1440 n = ii[1] - ii[0]; ii++; 1441 if (usecprow) { 1442 z = zarray + 6*ridx[i]; 1443 y = yarray + 6*ridx[i]; 1444 } 1445 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1446 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1447 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1448 for (j=0; j<n; j++) { 1449 xb = x + 6*(*idx++); 1450 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1451 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1452 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1453 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1454 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1455 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1456 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1457 v += 36; 1458 } 1459 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1460 if (!usecprow) { 1461 z += 6; y += 6; 1462 } 1463 } 1464 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1465 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1466 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1467 PetscFunctionReturn(0); 1468 } 1469 1470 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 1471 { 1472 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1473 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1474 const PetscScalar *x,*xb; 1475 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 1476 const MatScalar *v; 1477 PetscErrorCode ierr; 1478 PetscInt mbs = a->mbs,i,j,n; 1479 const PetscInt *idx,*ii,*ridx = NULL; 1480 PetscBool usecprow=a->compressedrow.use; 1481 1482 PetscFunctionBegin; 1483 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1484 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1485 1486 idx = a->j; 1487 v = a->a; 1488 if (usecprow) { 1489 if (zz != yy) { 1490 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1491 } 1492 mbs = a->compressedrow.nrows; 1493 ii = a->compressedrow.i; 1494 ridx = a->compressedrow.rindex; 1495 } else { 1496 ii = a->i; 1497 y = yarray; 1498 z = zarray; 1499 } 1500 1501 for (i=0; i<mbs; i++) { 1502 n = ii[1] - ii[0]; ii++; 1503 if (usecprow) { 1504 z = zarray + 7*ridx[i]; 1505 y = yarray + 7*ridx[i]; 1506 } 1507 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1508 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1509 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1510 for (j=0; j<n; j++) { 1511 xb = x + 7*(*idx++); 1512 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1513 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1514 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1515 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1516 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1517 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1518 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1519 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1520 v += 49; 1521 } 1522 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1523 if (!usecprow) { 1524 z += 7; y += 7; 1525 } 1526 } 1527 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1528 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1529 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 1530 PetscFunctionReturn(0); 1531 } 1532 1533 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz) 1534 { 1535 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1536 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11; 1537 const PetscScalar *x,*xb; 1538 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray; 1539 const MatScalar *v; 1540 PetscErrorCode ierr; 1541 PetscInt mbs = a->mbs,i,j,n; 1542 const PetscInt *idx,*ii,*ridx = NULL; 1543 PetscBool usecprow=a->compressedrow.use; 1544 1545 PetscFunctionBegin; 1546 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1547 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1548 1549 idx = a->j; 1550 v = a->a; 1551 if (usecprow) { 1552 if (zz != yy) { 1553 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1554 } 1555 mbs = a->compressedrow.nrows; 1556 ii = a->compressedrow.i; 1557 ridx = a->compressedrow.rindex; 1558 } else { 1559 ii = a->i; 1560 y = yarray; 1561 z = zarray; 1562 } 1563 1564 for (i=0; i<mbs; i++) { 1565 n = ii[1] - ii[0]; ii++; 1566 if (usecprow) { 1567 z = zarray + 11*ridx[i]; 1568 y = yarray + 11*ridx[i]; 1569 } 1570 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 1571 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; 1572 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1573 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1574 for (j=0; j<n; j++) { 1575 xb = x + 11*(*idx++); 1576 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; 1577 sum1 += v[0]*x1 + v[11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11; 1578 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11; 1579 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11; 1580 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11; 1581 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11; 1582 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11; 1583 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11; 1584 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11; 1585 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11; 1586 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11; 1587 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11; 1588 v += 121; 1589 } 1590 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1591 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10; 1592 if (!usecprow) { 1593 z += 11; y += 11; 1594 } 1595 } 1596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1597 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1598 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr); 1599 PetscFunctionReturn(0); 1600 } 1601 1602 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 1603 { 1604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1605 PetscScalar *z = 0,*work,*workt,*zarray; 1606 const PetscScalar *x,*xb; 1607 const MatScalar *v; 1608 PetscErrorCode ierr; 1609 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1610 PetscInt ncols,k; 1611 const PetscInt *ridx = NULL,*idx,*ii; 1612 PetscBool usecprow = a->compressedrow.use; 1613 1614 PetscFunctionBegin; 1615 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 1616 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1617 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1618 1619 idx = a->j; 1620 v = a->a; 1621 if (usecprow) { 1622 mbs = a->compressedrow.nrows; 1623 ii = a->compressedrow.i; 1624 ridx = a->compressedrow.rindex; 1625 } else { 1626 mbs = a->mbs; 1627 ii = a->i; 1628 z = zarray; 1629 } 1630 1631 if (!a->mult_work) { 1632 k = PetscMax(A->rmap->n,A->cmap->n); 1633 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1634 } 1635 work = a->mult_work; 1636 for (i=0; i<mbs; i++) { 1637 n = ii[1] - ii[0]; ii++; 1638 ncols = n*bs; 1639 workt = work; 1640 for (j=0; j<n; j++) { 1641 xb = x + bs*(*idx++); 1642 for (k=0; k<bs; k++) workt[k] = xb[k]; 1643 workt += bs; 1644 } 1645 if (usecprow) z = zarray + bs*ridx[i]; 1646 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 1647 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 1648 v += n*bs2; 1649 if (!usecprow) z += bs; 1650 } 1651 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1652 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1653 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1658 { 1659 PetscScalar zero = 0.0; 1660 PetscErrorCode ierr; 1661 1662 PetscFunctionBegin; 1663 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1664 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1665 PetscFunctionReturn(0); 1666 } 1667 1668 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 1669 { 1670 PetscScalar zero = 0.0; 1671 PetscErrorCode ierr; 1672 1673 PetscFunctionBegin; 1674 ierr = VecSet(zz,zero);CHKERRQ(ierr); 1675 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 1676 PetscFunctionReturn(0); 1677 } 1678 1679 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1680 { 1681 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1682 PetscScalar *z,x1,x2,x3,x4,x5; 1683 const PetscScalar *x,*xb = NULL; 1684 const MatScalar *v; 1685 PetscErrorCode ierr; 1686 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 1687 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1688 Mat_CompressedRow cprow = a->compressedrow; 1689 PetscBool usecprow = cprow.use; 1690 1691 PetscFunctionBegin; 1692 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1693 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1694 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1695 1696 idx = a->j; 1697 v = a->a; 1698 if (usecprow) { 1699 mbs = cprow.nrows; 1700 ii = cprow.i; 1701 ridx = cprow.rindex; 1702 } else { 1703 mbs=a->mbs; 1704 ii = a->i; 1705 xb = x; 1706 } 1707 1708 switch (bs) { 1709 case 1: 1710 for (i=0; i<mbs; i++) { 1711 if (usecprow) xb = x + ridx[i]; 1712 x1 = xb[0]; 1713 ib = idx + ii[0]; 1714 n = ii[1] - ii[0]; ii++; 1715 for (j=0; j<n; j++) { 1716 rval = ib[j]; 1717 z[rval] += PetscConj(*v) * x1; 1718 v++; 1719 } 1720 if (!usecprow) xb++; 1721 } 1722 break; 1723 case 2: 1724 for (i=0; i<mbs; i++) { 1725 if (usecprow) xb = x + 2*ridx[i]; 1726 x1 = xb[0]; x2 = xb[1]; 1727 ib = idx + ii[0]; 1728 n = ii[1] - ii[0]; ii++; 1729 for (j=0; j<n; j++) { 1730 rval = ib[j]*2; 1731 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 1732 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 1733 v += 4; 1734 } 1735 if (!usecprow) xb += 2; 1736 } 1737 break; 1738 case 3: 1739 for (i=0; i<mbs; i++) { 1740 if (usecprow) xb = x + 3*ridx[i]; 1741 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1742 ib = idx + ii[0]; 1743 n = ii[1] - ii[0]; ii++; 1744 for (j=0; j<n; j++) { 1745 rval = ib[j]*3; 1746 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 1747 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 1748 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 1749 v += 9; 1750 } 1751 if (!usecprow) xb += 3; 1752 } 1753 break; 1754 case 4: 1755 for (i=0; i<mbs; i++) { 1756 if (usecprow) xb = x + 4*ridx[i]; 1757 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1758 ib = idx + ii[0]; 1759 n = ii[1] - ii[0]; ii++; 1760 for (j=0; j<n; j++) { 1761 rval = ib[j]*4; 1762 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 1763 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 1764 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 1765 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 1766 v += 16; 1767 } 1768 if (!usecprow) xb += 4; 1769 } 1770 break; 1771 case 5: 1772 for (i=0; i<mbs; i++) { 1773 if (usecprow) xb = x + 5*ridx[i]; 1774 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1775 x4 = xb[3]; x5 = xb[4]; 1776 ib = idx + ii[0]; 1777 n = ii[1] - ii[0]; ii++; 1778 for (j=0; j<n; j++) { 1779 rval = ib[j]*5; 1780 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 1781 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 1782 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 1783 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 1784 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 1785 v += 25; 1786 } 1787 if (!usecprow) xb += 5; 1788 } 1789 break; 1790 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 1791 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 1792 #if 0 1793 { 1794 PetscInt ncols,k,bs2=a->bs2; 1795 PetscScalar *work,*workt,zb; 1796 const PetscScalar *xtmp; 1797 if (!a->mult_work) { 1798 k = PetscMax(A->rmap->n,A->cmap->n); 1799 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1800 } 1801 work = a->mult_work; 1802 xtmp = x; 1803 for (i=0; i<mbs; i++) { 1804 n = ii[1] - ii[0]; ii++; 1805 ncols = n*bs; 1806 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1807 if (usecprow) xtmp = x + bs*ridx[i]; 1808 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1809 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1810 v += n*bs2; 1811 if (!usecprow) xtmp += bs; 1812 workt = work; 1813 for (j=0; j<n; j++) { 1814 zb = z + bs*(*idx++); 1815 for (k=0; k<bs; k++) zb[k] += workt[k] ; 1816 workt += bs; 1817 } 1818 } 1819 } 1820 #endif 1821 } 1822 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1823 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1824 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1825 PetscFunctionReturn(0); 1826 } 1827 1828 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 1829 { 1830 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1831 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 1832 const PetscScalar *x,*xb = 0; 1833 const MatScalar *v; 1834 PetscErrorCode ierr; 1835 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 1836 const PetscInt *idx,*ii,*ib,*ridx = NULL; 1837 Mat_CompressedRow cprow = a->compressedrow; 1838 PetscBool usecprow=cprow.use; 1839 1840 PetscFunctionBegin; 1841 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 1842 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1843 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1844 1845 idx = a->j; 1846 v = a->a; 1847 if (usecprow) { 1848 mbs = cprow.nrows; 1849 ii = cprow.i; 1850 ridx = cprow.rindex; 1851 } else { 1852 mbs=a->mbs; 1853 ii = a->i; 1854 xb = x; 1855 } 1856 1857 switch (bs) { 1858 case 1: 1859 for (i=0; i<mbs; i++) { 1860 if (usecprow) xb = x + ridx[i]; 1861 x1 = xb[0]; 1862 ib = idx + ii[0]; 1863 n = ii[1] - ii[0]; ii++; 1864 for (j=0; j<n; j++) { 1865 rval = ib[j]; 1866 z[rval] += *v * x1; 1867 v++; 1868 } 1869 if (!usecprow) xb++; 1870 } 1871 break; 1872 case 2: 1873 for (i=0; i<mbs; i++) { 1874 if (usecprow) xb = x + 2*ridx[i]; 1875 x1 = xb[0]; x2 = xb[1]; 1876 ib = idx + ii[0]; 1877 n = ii[1] - ii[0]; ii++; 1878 for (j=0; j<n; j++) { 1879 rval = ib[j]*2; 1880 z[rval++] += v[0]*x1 + v[1]*x2; 1881 z[rval++] += v[2]*x1 + v[3]*x2; 1882 v += 4; 1883 } 1884 if (!usecprow) xb += 2; 1885 } 1886 break; 1887 case 3: 1888 for (i=0; i<mbs; i++) { 1889 if (usecprow) xb = x + 3*ridx[i]; 1890 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1891 ib = idx + ii[0]; 1892 n = ii[1] - ii[0]; ii++; 1893 for (j=0; j<n; j++) { 1894 rval = ib[j]*3; 1895 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 1896 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 1897 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 1898 v += 9; 1899 } 1900 if (!usecprow) xb += 3; 1901 } 1902 break; 1903 case 4: 1904 for (i=0; i<mbs; i++) { 1905 if (usecprow) xb = x + 4*ridx[i]; 1906 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1907 ib = idx + ii[0]; 1908 n = ii[1] - ii[0]; ii++; 1909 for (j=0; j<n; j++) { 1910 rval = ib[j]*4; 1911 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1912 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1913 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1914 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1915 v += 16; 1916 } 1917 if (!usecprow) xb += 4; 1918 } 1919 break; 1920 case 5: 1921 for (i=0; i<mbs; i++) { 1922 if (usecprow) xb = x + 5*ridx[i]; 1923 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1924 x4 = xb[3]; x5 = xb[4]; 1925 ib = idx + ii[0]; 1926 n = ii[1] - ii[0]; ii++; 1927 for (j=0; j<n; j++) { 1928 rval = ib[j]*5; 1929 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1930 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1931 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1932 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1933 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1934 v += 25; 1935 } 1936 if (!usecprow) xb += 5; 1937 } 1938 break; 1939 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 1940 PetscInt ncols,k; 1941 PetscScalar *work,*workt; 1942 const PetscScalar *xtmp; 1943 if (!a->mult_work) { 1944 k = PetscMax(A->rmap->n,A->cmap->n); 1945 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1946 } 1947 work = a->mult_work; 1948 xtmp = x; 1949 for (i=0; i<mbs; i++) { 1950 n = ii[1] - ii[0]; ii++; 1951 ncols = n*bs; 1952 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 1953 if (usecprow) xtmp = x + bs*ridx[i]; 1954 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 1955 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 1956 v += n*bs2; 1957 if (!usecprow) xtmp += bs; 1958 workt = work; 1959 for (j=0; j<n; j++) { 1960 zb = z + bs*(*idx++); 1961 for (k=0; k<bs; k++) zb[k] += workt[k]; 1962 workt += bs; 1963 } 1964 } 1965 } 1966 } 1967 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1968 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1969 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 1970 PetscFunctionReturn(0); 1971 } 1972 1973 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 1974 { 1975 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 1976 PetscInt totalnz = a->bs2*a->nz; 1977 PetscScalar oalpha = alpha; 1978 PetscErrorCode ierr; 1979 PetscBLASInt one = 1,tnz; 1980 1981 PetscFunctionBegin; 1982 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 1983 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 1984 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 1985 PetscFunctionReturn(0); 1986 } 1987 1988 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 1989 { 1990 PetscErrorCode ierr; 1991 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1992 MatScalar *v = a->a; 1993 PetscReal sum = 0.0; 1994 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 1995 1996 PetscFunctionBegin; 1997 if (type == NORM_FROBENIUS) { 1998 #if defined(PETSC_USE_REAL___FP16) 1999 PetscBLASInt one = 1,cnt = bs2*nz; 2000 *norm = BLASnrm2_(&cnt,v,&one); 2001 #else 2002 for (i=0; i<bs2*nz; i++) { 2003 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2004 } 2005 #endif 2006 *norm = PetscSqrtReal(sum); 2007 ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr); 2008 } else if (type == NORM_1) { /* maximum column sum */ 2009 PetscReal *tmp; 2010 PetscInt *bcol = a->j; 2011 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2012 for (i=0; i<nz; i++) { 2013 for (j=0; j<bs; j++) { 2014 k1 = bs*(*bcol) + j; /* column index */ 2015 for (k=0; k<bs; k++) { 2016 tmp[k1] += PetscAbsScalar(*v); v++; 2017 } 2018 } 2019 bcol++; 2020 } 2021 *norm = 0.0; 2022 for (j=0; j<A->cmap->n; j++) { 2023 if (tmp[j] > *norm) *norm = tmp[j]; 2024 } 2025 ierr = PetscFree(tmp);CHKERRQ(ierr); 2026 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2027 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2028 *norm = 0.0; 2029 for (k=0; k<bs; k++) { 2030 for (j=0; j<a->mbs; j++) { 2031 v = a->a + bs2*a->i[j] + k; 2032 sum = 0.0; 2033 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2034 for (k1=0; k1<bs; k1++) { 2035 sum += PetscAbsScalar(*v); 2036 v += bs; 2037 } 2038 } 2039 if (sum > *norm) *norm = sum; 2040 } 2041 } 2042 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr); 2043 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2044 PetscFunctionReturn(0); 2045 } 2046 2047 2048 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2049 { 2050 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2051 PetscErrorCode ierr; 2052 2053 PetscFunctionBegin; 2054 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2055 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2056 *flg = PETSC_FALSE; 2057 PetscFunctionReturn(0); 2058 } 2059 2060 /* if the a->i are the same */ 2061 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2062 if (!*flg) PetscFunctionReturn(0); 2063 2064 /* if a->j are the same */ 2065 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2066 if (!*flg) PetscFunctionReturn(0); 2067 2068 /* if a->a are the same */ 2069 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2070 PetscFunctionReturn(0); 2071 2072 } 2073 2074 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2075 { 2076 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2077 PetscErrorCode ierr; 2078 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2079 PetscScalar *x,zero = 0.0; 2080 MatScalar *aa,*aa_j; 2081 2082 PetscFunctionBegin; 2083 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2084 bs = A->rmap->bs; 2085 aa = a->a; 2086 ai = a->i; 2087 aj = a->j; 2088 ambs = a->mbs; 2089 bs2 = a->bs2; 2090 2091 ierr = VecSet(v,zero);CHKERRQ(ierr); 2092 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2093 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2094 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2095 for (i=0; i<ambs; i++) { 2096 for (j=ai[i]; j<ai[i+1]; j++) { 2097 if (aj[j] == i) { 2098 row = i*bs; 2099 aa_j = aa+j*bs2; 2100 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2101 break; 2102 } 2103 } 2104 } 2105 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2106 PetscFunctionReturn(0); 2107 } 2108 2109 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2110 { 2111 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2112 const PetscScalar *l,*r,*li,*ri; 2113 PetscScalar x; 2114 MatScalar *aa, *v; 2115 PetscErrorCode ierr; 2116 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2117 const PetscInt *ai,*aj; 2118 2119 PetscFunctionBegin; 2120 ai = a->i; 2121 aj = a->j; 2122 aa = a->a; 2123 m = A->rmap->n; 2124 n = A->cmap->n; 2125 bs = A->rmap->bs; 2126 mbs = a->mbs; 2127 bs2 = a->bs2; 2128 if (ll) { 2129 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2130 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2131 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2132 for (i=0; i<mbs; i++) { /* for each block row */ 2133 M = ai[i+1] - ai[i]; 2134 li = l + i*bs; 2135 v = aa + bs2*ai[i]; 2136 for (j=0; j<M; j++) { /* for each block */ 2137 for (k=0; k<bs2; k++) { 2138 (*v++) *= li[k%bs]; 2139 } 2140 } 2141 } 2142 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2143 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2144 } 2145 2146 if (rr) { 2147 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2148 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2149 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2150 for (i=0; i<mbs; i++) { /* for each block row */ 2151 iai = ai[i]; 2152 M = ai[i+1] - iai; 2153 v = aa + bs2*iai; 2154 for (j=0; j<M; j++) { /* for each block */ 2155 ri = r + bs*aj[iai+j]; 2156 for (k=0; k<bs; k++) { 2157 x = ri[k]; 2158 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2159 v += bs; 2160 } 2161 } 2162 } 2163 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2164 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2165 } 2166 PetscFunctionReturn(0); 2167 } 2168 2169 2170 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2171 { 2172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2173 2174 PetscFunctionBegin; 2175 info->block_size = a->bs2; 2176 info->nz_allocated = a->bs2*a->maxnz; 2177 info->nz_used = a->bs2*a->nz; 2178 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2179 info->assemblies = A->num_ass; 2180 info->mallocs = A->info.mallocs; 2181 info->memory = ((PetscObject)A)->mem; 2182 if (A->factortype) { 2183 info->fill_ratio_given = A->info.fill_ratio_given; 2184 info->fill_ratio_needed = A->info.fill_ratio_needed; 2185 info->factor_mallocs = A->info.factor_mallocs; 2186 } else { 2187 info->fill_ratio_given = 0; 2188 info->fill_ratio_needed = 0; 2189 info->factor_mallocs = 0; 2190 } 2191 PetscFunctionReturn(0); 2192 } 2193 2194 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2195 { 2196 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2197 PetscErrorCode ierr; 2198 2199 PetscFunctionBegin; 2200 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2201 PetscFunctionReturn(0); 2202 } 2203