1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag,sorted; 90 91 PetscFunctionBegin; 92 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 93 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 94 95 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 96 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 98 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 99 100 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 101 ssmap = smap; 102 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) lens[i]++; 111 } 112 } 113 /* Create and fill new matrix */ 114 if (scall == MAT_REUSE_MATRIX) { 115 c = (Mat_SeqBAIJ*)((*B)->data); 116 117 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 118 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 119 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 120 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 121 C = *B; 122 } else { 123 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 124 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 125 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 126 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 127 } 128 c = (Mat_SeqBAIJ*)(C->data); 129 for (i=0; i<nrows; i++) { 130 row = irow[i]; 131 kstart = ai[row]; 132 kend = kstart + a->ilen[row]; 133 mat_i = c->i[i]; 134 mat_j = c->j + mat_i; 135 mat_a = c->a + mat_i*bs2; 136 mat_ilen = c->ilen + i; 137 for (k=kstart; k<kend; k++) { 138 if ((tcol=ssmap[a->j[k]])) { 139 *mat_j++ = tcol - 1; 140 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 141 mat_a += bs2; 142 (*mat_ilen)++; 143 } 144 } 145 } 146 147 /* Free work space */ 148 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 149 ierr = PetscFree(smap);CHKERRQ(ierr); 150 ierr = PetscFree(lens);CHKERRQ(ierr); 151 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 152 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 153 154 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 155 *B = C; 156 PetscFunctionReturn(0); 157 } 158 159 #undef __FUNCT__ 160 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 161 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 162 { 163 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 164 IS is1,is2; 165 PetscErrorCode ierr; 166 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 167 const PetscInt *irow,*icol; 168 169 PetscFunctionBegin; 170 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 171 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 172 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 173 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 174 175 /* Verify if the indices corespond to each element in a block 176 and form the IS with compressed IS */ 177 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 178 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 179 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 180 count = 0; 181 for (i=0; i<a->mbs; i++) { 182 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 183 if (vary[i]==bs) iary[count++] = i; 184 } 185 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 186 187 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 188 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 189 count = 0; 190 for (i=0; i<a->mbs; i++) { 191 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 192 if (vary[i]==bs) iary[count++] = i; 193 } 194 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 195 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 196 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 197 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 198 199 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 200 ierr = ISDestroy(&is1);CHKERRQ(ierr); 201 ierr = ISDestroy(&is2);CHKERRQ(ierr); 202 PetscFunctionReturn(0); 203 } 204 205 #undef __FUNCT__ 206 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 207 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 208 { 209 PetscErrorCode ierr; 210 PetscInt i; 211 212 PetscFunctionBegin; 213 if (scall == MAT_INITIAL_MATRIX) { 214 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 215 } 216 217 for (i=0; i<n; i++) { 218 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 219 } 220 PetscFunctionReturn(0); 221 } 222 223 224 /* -------------------------------------------------------*/ 225 /* Should check that shapes of vectors and matrices match */ 226 /* -------------------------------------------------------*/ 227 228 #if defined(PETSC_THREADCOMM_ACTIVE) 229 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 233 PetscScalar *z; 234 const PetscScalar *x; 235 const MatScalar *aa; 236 PetscInt *trstarts=A->rmap->trstarts; 237 PetscInt n,start,end,i; 238 const PetscInt *aj,*ai; 239 PetscScalar sum; 240 241 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 242 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 243 start = trstarts[thread_id]; 244 end = trstarts[thread_id+1]; 245 ai = a->i; 246 for (i=start; i<end; i++) { 247 n = ai[i+1] - ai[i]; 248 aj = a->j + ai[i]; 249 aa = a->a + ai[i]; 250 sum = 0.0; 251 PetscSparseDensePlusDot(sum,x,aa,aj,n); 252 z[i] = sum; 253 } 254 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 255 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 256 return 0; 257 } 258 259 #undef __FUNCT__ 260 #define __FUNCT__ "MatMult_SeqBAIJ_1" 261 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 262 { 263 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 264 PetscScalar *z,sum; 265 const PetscScalar *x; 266 const MatScalar *v; 267 PetscErrorCode ierr; 268 PetscInt mbs,i,n; 269 const PetscInt *idx,*ii,*ridx=NULL; 270 PetscBool usecprow=a->compressedrow.use; 271 272 PetscFunctionBegin; 273 274 if (usecprow) { 275 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 276 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 277 mbs = a->compressedrow.nrows; 278 ii = a->compressedrow.i; 279 ridx = a->compressedrow.rindex; 280 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 281 for (i=0; i<mbs; i++) { 282 n = ii[i+1] - ii[i]; 283 v = a->a + ii[i]; 284 idx = a->j + ii[i]; 285 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 286 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 287 sum = 0.0; 288 PetscSparseDensePlusDot(sum,x,v,idx,n); 289 z[ridx[i]] = sum; 290 } 291 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 293 } else { 294 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 295 } 296 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 #else 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatMult_SeqBAIJ_1" 302 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 303 { 304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 305 PetscScalar *z,sum; 306 const PetscScalar *x; 307 const MatScalar *v; 308 PetscErrorCode ierr; 309 PetscInt mbs,i,n; 310 const PetscInt *idx,*ii,*ridx=NULL; 311 PetscBool usecprow=a->compressedrow.use; 312 313 PetscFunctionBegin; 314 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 315 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 316 317 if (usecprow) { 318 mbs = a->compressedrow.nrows; 319 ii = a->compressedrow.i; 320 ridx = a->compressedrow.rindex; 321 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 322 } else { 323 mbs = a->mbs; 324 ii = a->i; 325 } 326 327 for (i=0; i<mbs; i++) { 328 n = ii[1] - ii[0]; 329 v = a->a + ii[0]; 330 idx = a->j + ii[0]; 331 ii++; 332 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 333 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 334 sum = 0.0; 335 PetscSparseDensePlusDot(sum,x,v,idx,n); 336 if (usecprow) { 337 z[ridx[i]] = sum; 338 } else { 339 z[i] = sum; 340 } 341 } 342 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 344 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 #endif 348 349 #if defined(PETSC_THREADCOMM_ACTIVE) 350 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 351 { 352 PetscErrorCode ierr; 353 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 354 PetscScalar *z,x1,x2,sum1,sum2; 355 const PetscScalar *x,*xb; 356 const MatScalar *aa; 357 PetscInt *trstarts=A->rmap->trstarts; 358 PetscInt n,start,end,i,j; 359 const PetscInt *aj,*ai; 360 361 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 362 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 363 start = trstarts[thread_id] / 2; 364 end = trstarts[thread_id+1] / 2; 365 ai = a->i; 366 for (i=start; i<end; i++) { 367 n = ai[i+1] - ai[i]; 368 aj = a->j + ai[i]; 369 aa = a->a + ai[i]*4; 370 sum1 = 0.0; sum2 = 0.0; 371 for (j=0; j<n; j++) { 372 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 373 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 374 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 375 } 376 z[2*i] = sum1; z[2*i+1] = sum2; 377 } 378 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 379 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 380 return 0; 381 } 382 383 #undef __FUNCT__ 384 #define __FUNCT__ "MatMult_SeqBAIJ_2" 385 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 386 { 387 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 388 PetscScalar *z,x1,x2,sum1,sum2; 389 const PetscScalar *x,*xb; 390 const MatScalar *v; 391 PetscErrorCode ierr; 392 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 393 PetscBool usecprow=a->compressedrow.use; 394 395 PetscFunctionBegin; 396 397 if (usecprow) { 398 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 399 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 400 mbs = a->compressedrow.nrows; 401 ii = a->compressedrow.i; 402 ridx = a->compressedrow.rindex; 403 for (i=0; i<mbs; i++) { 404 n = ii[i+1] - ii[i]; 405 idx = a->j + ii[i]; 406 v = a->a + ii[i]*4; 407 sum1 = 0.0; sum2 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 412 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 413 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 414 } 415 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 416 } 417 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 418 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 419 } else { 420 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 421 } 422 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 423 PetscFunctionReturn(0); 424 } 425 #else 426 #undef __FUNCT__ 427 #define __FUNCT__ "MatMult_SeqBAIJ_2" 428 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 429 { 430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 431 PetscScalar *z = 0,sum1,sum2,*zarray; 432 const PetscScalar *x,*xb; 433 PetscScalar x1,x2; 434 const MatScalar *v; 435 PetscErrorCode ierr; 436 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 437 PetscBool usecprow=a->compressedrow.use; 438 439 PetscFunctionBegin; 440 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 441 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 442 443 idx = a->j; 444 v = a->a; 445 if (usecprow) { 446 mbs = a->compressedrow.nrows; 447 ii = a->compressedrow.i; 448 ridx = a->compressedrow.rindex; 449 } else { 450 mbs = a->mbs; 451 ii = a->i; 452 z = zarray; 453 } 454 455 for (i=0; i<mbs; i++) { 456 n = ii[1] - ii[0]; ii++; 457 sum1 = 0.0; sum2 = 0.0; 458 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 459 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 460 for (j=0; j<n; j++) { 461 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 462 sum1 += v[0]*x1 + v[2]*x2; 463 sum2 += v[1]*x1 + v[3]*x2; 464 v += 4; 465 } 466 if (usecprow) z = zarray + 2*ridx[i]; 467 z[0] = sum1; z[1] = sum2; 468 if (!usecprow) z += 2; 469 } 470 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 471 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 472 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 473 PetscFunctionReturn(0); 474 } 475 #endif 476 477 #if defined(PETSC_THREADCOMM_ACTIVE) 478 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 479 { 480 PetscErrorCode ierr; 481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 482 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 483 const PetscScalar *x,*xb; 484 const MatScalar *aa; 485 PetscInt *trstarts=A->rmap->trstarts; 486 PetscInt n,start,end,i,j; 487 const PetscInt *aj,*ai; 488 489 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 490 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 491 start = trstarts[thread_id] / 3; 492 end = trstarts[thread_id+1] / 3; 493 ai = a->i; 494 for (i=start; i<end; i++) { 495 n = ai[i+1] - ai[i]; 496 aj = a->j + ai[i]; 497 aa = a->a + ai[i]*9; 498 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 499 for (j=0; j<n; j++) { 500 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 501 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 502 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 503 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 504 } 505 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 506 } 507 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 508 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 509 return 0; 510 } 511 512 #undef __FUNCT__ 513 #define __FUNCT__ "MatMult_SeqBAIJ_3" 514 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 517 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 518 const PetscScalar *x,*xb; 519 const MatScalar *v; 520 PetscErrorCode ierr; 521 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 522 PetscBool usecprow=a->compressedrow.use; 523 524 525 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 526 #pragma disjoint(*v,*z,*xb) 527 #endif 528 529 PetscFunctionBegin; 530 531 if (usecprow) { 532 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 533 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 534 mbs = a->compressedrow.nrows; 535 ii = a->compressedrow.i; 536 ridx = a->compressedrow.rindex; 537 for (i=0; i<mbs; i++) { 538 n = ii[i+1] - ii[i]; 539 idx = a->j + ii[i]; 540 v = a->a + ii[i]*9; 541 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 542 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 543 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 544 for (j=0; j<n; j++) { 545 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 546 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 547 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 548 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 549 } 550 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 551 } 552 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 554 } else { 555 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 556 } 557 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 558 PetscFunctionReturn(0); 559 } 560 #else 561 #undef __FUNCT__ 562 #define __FUNCT__ "MatMult_SeqBAIJ_3" 563 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 564 { 565 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 566 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 567 const PetscScalar *x,*xb; 568 const MatScalar *v; 569 PetscErrorCode ierr; 570 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 571 PetscBool usecprow=a->compressedrow.use; 572 573 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 574 #pragma disjoint(*v,*z,*xb) 575 #endif 576 577 PetscFunctionBegin; 578 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 579 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 580 581 idx = a->j; 582 v = a->a; 583 if (usecprow) { 584 mbs = a->compressedrow.nrows; 585 ii = a->compressedrow.i; 586 ridx = a->compressedrow.rindex; 587 } else { 588 mbs = a->mbs; 589 ii = a->i; 590 z = zarray; 591 } 592 593 for (i=0; i<mbs; i++) { 594 n = ii[1] - ii[0]; ii++; 595 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 596 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 597 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 598 for (j=0; j<n; j++) { 599 xb = x + 3*(*idx++); 600 x1 = xb[0]; 601 x2 = xb[1]; 602 x3 = xb[2]; 603 604 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 605 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 606 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 607 v += 9; 608 } 609 if (usecprow) z = zarray + 3*ridx[i]; 610 z[0] = sum1; z[1] = sum2; z[2] = sum3; 611 if (!usecprow) z += 3; 612 } 613 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 614 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 615 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 616 PetscFunctionReturn(0); 617 } 618 #endif 619 620 #if defined(PETSC_THREADCOMM_ACTIVE) 621 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 622 { 623 PetscErrorCode ierr; 624 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 625 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 626 const PetscScalar *x,*xb; 627 const MatScalar *aa; 628 PetscInt *trstarts=A->rmap->trstarts; 629 PetscInt n,start,end,i,j; 630 const PetscInt *aj,*ai; 631 632 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 633 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 634 start = trstarts[thread_id] / 4; 635 end = trstarts[thread_id+1] / 4; 636 ai = a->i; 637 for (i=start; i<end; i++) { 638 n = ai[i+1] - ai[i]; 639 aj = a->j + ai[i]; 640 aa = a->a + ai[i]*16; 641 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 642 for (j=0; j<n; j++) { 643 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 644 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 645 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 646 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 647 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 648 } 649 z[4*i] = sum1; z[4*i+1] = sum2; 650 z[4*i+2] = sum3; z[4*i+3] = sum4; 651 } 652 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 653 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 654 return 0; 655 } 656 657 #undef __FUNCT__ 658 #define __FUNCT__ "MatMult_SeqBAIJ_4" 659 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 660 { 661 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 662 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 663 const PetscScalar *x,*xb; 664 const MatScalar *v; 665 PetscErrorCode ierr; 666 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 667 PetscBool usecprow=a->compressedrow.use; 668 669 PetscFunctionBegin; 670 671 if (usecprow) { 672 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 673 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 674 mbs = a->compressedrow.nrows; 675 ii = a->compressedrow.i; 676 ridx = a->compressedrow.rindex; 677 for (i=0; i<mbs; i++) { 678 n = ii[i+1] - ii[1]; 679 idx = a->j + ii[i]; 680 v = a->a + ii[i]*16; 681 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 682 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 683 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 684 for (j=0; j<n; j++) { 685 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 686 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 687 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 688 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 689 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 690 } 691 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 692 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 693 } 694 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 695 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 696 } else { 697 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 698 } 699 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 700 PetscFunctionReturn(0); 701 } 702 #else 703 #undef __FUNCT__ 704 #define __FUNCT__ "MatMult_SeqBAIJ_4" 705 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 706 { 707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 708 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 709 const PetscScalar *x,*xb; 710 const MatScalar *v; 711 PetscErrorCode ierr; 712 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 713 PetscBool usecprow=a->compressedrow.use; 714 715 PetscFunctionBegin; 716 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 717 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 718 719 idx = a->j; 720 v = a->a; 721 if (usecprow) { 722 mbs = a->compressedrow.nrows; 723 ii = a->compressedrow.i; 724 ridx = a->compressedrow.rindex; 725 } else { 726 mbs = a->mbs; 727 ii = a->i; 728 z = zarray; 729 } 730 731 for (i=0; i<mbs; i++) { 732 n = ii[1] - ii[0]; 733 ii++; 734 sum1 = 0.0; 735 sum2 = 0.0; 736 sum3 = 0.0; 737 sum4 = 0.0; 738 739 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 740 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 741 for (j=0; j<n; j++) { 742 xb = x + 4*(*idx++); 743 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 744 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 745 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 746 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 747 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 748 v += 16; 749 } 750 if (usecprow) z = zarray + 4*ridx[i]; 751 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 752 if (!usecprow) z += 4; 753 } 754 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 755 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 756 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 757 PetscFunctionReturn(0); 758 } 759 #endif 760 761 #if defined(PETSC_THREADCOMM_ACTIVE) 762 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 763 { 764 PetscErrorCode ierr; 765 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 766 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 767 const PetscScalar *x,*xb; 768 const MatScalar *aa; 769 PetscInt *trstarts=A->rmap->trstarts; 770 PetscInt n,start,end,i,j; 771 const PetscInt *aj,*ai; 772 773 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 774 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 775 start = trstarts[thread_id] / 5; 776 end = trstarts[thread_id+1] / 5; 777 ai = a->i; 778 for (i=start; i<end; i++) { 779 n = ai[i+1] - ai[i]; 780 aj = a->j + ai[i]; 781 aa = a->a + ai[i]*25; 782 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 783 for (j=0; j<n; j++) { 784 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 785 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 786 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 787 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 788 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 789 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 790 } 791 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 792 z[5*i+3] = sum4; z[5*i+4] = sum5; 793 } 794 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 795 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 796 return 0; 797 } 798 799 #undef __FUNCT__ 800 #define __FUNCT__ "MatMult_SeqBAIJ_5" 801 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 802 { 803 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 804 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 805 const PetscScalar *xb,*x; 806 const MatScalar *v; 807 PetscErrorCode ierr; 808 const PetscInt *idx,*ii,*ridx=NULL; 809 PetscInt mbs,i,j,n; 810 PetscBool usecprow=a->compressedrow.use; 811 812 PetscFunctionBegin; 813 814 if (usecprow) { 815 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 816 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 817 mbs = a->compressedrow.nrows; 818 ii = a->compressedrow.i; 819 ridx = a->compressedrow.rindex; 820 for (i=0; i<mbs; i++) { 821 n = ii[i+1] - ii[i]; 822 idx = a->j + ii[i]; 823 v = a->a + ii[i]*25; 824 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 825 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 826 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 827 for (j=0; j<n; j++) { 828 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 829 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 830 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 831 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 832 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 833 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 834 } 835 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 836 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 837 } 838 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 839 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 840 } else { 841 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 842 } 843 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 844 PetscFunctionReturn(0); 845 } 846 #else 847 #undef __FUNCT__ 848 #define __FUNCT__ "MatMult_SeqBAIJ_5" 849 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 850 { 851 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 852 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 853 const PetscScalar *xb,*x; 854 const MatScalar *v; 855 PetscErrorCode ierr; 856 const PetscInt *idx,*ii,*ridx=NULL; 857 PetscInt mbs,i,j,n; 858 PetscBool usecprow=a->compressedrow.use; 859 860 PetscFunctionBegin; 861 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 862 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 863 864 idx = a->j; 865 v = a->a; 866 if (usecprow) { 867 mbs = a->compressedrow.nrows; 868 ii = a->compressedrow.i; 869 ridx = a->compressedrow.rindex; 870 } else { 871 mbs = a->mbs; 872 ii = a->i; 873 z = zarray; 874 } 875 876 for (i=0; i<mbs; i++) { 877 n = ii[1] - ii[0]; ii++; 878 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 879 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 880 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 881 for (j=0; j<n; j++) { 882 xb = x + 5*(*idx++); 883 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 884 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 885 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 886 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 887 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 888 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 889 v += 25; 890 } 891 if (usecprow) z = zarray + 5*ridx[i]; 892 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 893 if (!usecprow) z += 5; 894 } 895 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 896 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 897 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 898 PetscFunctionReturn(0); 899 } 900 #endif 901 902 903 #if defined(PETSC_THREADCOMM_ACTIVE) 904 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 905 { 906 PetscErrorCode ierr; 907 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 908 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 909 const PetscScalar *x,*xb; 910 const MatScalar *aa; 911 PetscInt *trstarts=A->rmap->trstarts; 912 PetscInt n,start,end,i,j; 913 const PetscInt *aj,*ai; 914 915 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 916 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 917 start = trstarts[thread_id] / 6; 918 end = trstarts[thread_id+1] / 6; 919 ai = a->i; 920 for (i=start; i<end; i++) { 921 n = ai[i+1] - ai[i]; 922 aj = a->j + ai[i]; 923 aa = a->a + ai[i]*36; 924 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 925 for (j=0; j<n; j++) { 926 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 927 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 928 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 929 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 930 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 931 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 932 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 933 } 934 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 935 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 936 } 937 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 938 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 939 return 0; 940 } 941 942 #undef __FUNCT__ 943 #define __FUNCT__ "MatMult_SeqBAIJ_6" 944 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 945 { 946 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 947 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 948 const PetscScalar *x,*xb; 949 const MatScalar *v; 950 PetscErrorCode ierr; 951 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 952 PetscBool usecprow=a->compressedrow.use; 953 954 PetscFunctionBegin; 955 956 if (usecprow) { 957 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 958 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 959 mbs = a->compressedrow.nrows; 960 ii = a->compressedrow.i; 961 ridx = a->compressedrow.rindex; 962 for (i=0; i<mbs; i++) { 963 n = ii[i+1] - ii[i]; 964 idx = a->j + ii[i]; 965 v = a->a + ii[i]*36; 966 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 967 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 968 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 969 for (j=0; j<n; j++) { 970 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 971 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 972 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 973 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 974 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 975 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 976 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 977 } 978 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 979 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 980 } 981 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 982 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 983 } else { 984 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 985 } 986 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 987 PetscFunctionReturn(0); 988 } 989 #else 990 #undef __FUNCT__ 991 #define __FUNCT__ "MatMult_SeqBAIJ_6" 992 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 993 { 994 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 995 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 996 const PetscScalar *x,*xb; 997 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 998 const MatScalar *v; 999 PetscErrorCode ierr; 1000 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1001 PetscBool usecprow=a->compressedrow.use; 1002 1003 PetscFunctionBegin; 1004 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1005 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1006 1007 idx = a->j; 1008 v = a->a; 1009 if (usecprow) { 1010 mbs = a->compressedrow.nrows; 1011 ii = a->compressedrow.i; 1012 ridx = a->compressedrow.rindex; 1013 } else { 1014 mbs = a->mbs; 1015 ii = a->i; 1016 z = zarray; 1017 } 1018 1019 for (i=0; i<mbs; i++) { 1020 n = ii[1] - ii[0]; 1021 ii++; 1022 sum1 = 0.0; 1023 sum2 = 0.0; 1024 sum3 = 0.0; 1025 sum4 = 0.0; 1026 sum5 = 0.0; 1027 sum6 = 0.0; 1028 1029 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1030 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1031 for (j=0; j<n; j++) { 1032 xb = x + 6*(*idx++); 1033 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1034 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1035 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1036 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1037 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1038 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1039 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1040 v += 36; 1041 } 1042 if (usecprow) z = zarray + 6*ridx[i]; 1043 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1044 if (!usecprow) z += 6; 1045 } 1046 1047 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1048 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1049 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 #endif 1053 1054 #if defined(PETSC_THREADCOMM_ACTIVE) 1055 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1056 { 1057 PetscErrorCode ierr; 1058 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1059 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1060 const PetscScalar *x,*xb; 1061 const MatScalar *aa; 1062 PetscInt *trstarts=A->rmap->trstarts; 1063 PetscInt n,start,end,i,j; 1064 const PetscInt *aj,*ai; 1065 1066 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1067 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1068 start = trstarts[thread_id] / 7; 1069 end = trstarts[thread_id+1] / 7; 1070 ai = a->i; 1071 for (i=start; i<end; i++) { 1072 n = ai[i+1] - ai[i]; 1073 aj = a->j + ai[i]; 1074 aa = a->a + ai[i]*49; 1075 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1076 for (j=0; j<n; j++) { 1077 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1078 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1079 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1080 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1081 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1082 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1083 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1084 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1085 } 1086 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1087 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1088 } 1089 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1091 return 0; 1092 } 1093 1094 #undef __FUNCT__ 1095 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1096 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1097 { 1098 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1099 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1100 const PetscScalar *x,*xb; 1101 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1102 const MatScalar *v; 1103 PetscErrorCode ierr; 1104 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1105 PetscBool usecprow=a->compressedrow.use; 1106 1107 PetscFunctionBegin; 1108 1109 if (usecprow) { 1110 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1111 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1112 mbs = a->compressedrow.nrows; 1113 ii = a->compressedrow.i; 1114 ridx = a->compressedrow.rindex; 1115 for (i=0; i<mbs; i++) { 1116 n = ii[i+1] - ii[i]; 1117 idx = a->j + ii[i]; 1118 v = a->a + ii[i]*49; 1119 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1120 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1121 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1122 for (j=0; j<n; j++) { 1123 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1124 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1125 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1126 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1127 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1128 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1129 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1130 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1131 } 1132 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1133 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1134 } 1135 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1136 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1137 } else { 1138 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1139 } 1140 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1141 PetscFunctionReturn(0); 1142 } 1143 #else 1144 #undef __FUNCT__ 1145 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1146 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1147 { 1148 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1149 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1150 const PetscScalar *x,*xb; 1151 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1152 const MatScalar *v; 1153 PetscErrorCode ierr; 1154 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1155 PetscBool usecprow=a->compressedrow.use; 1156 1157 PetscFunctionBegin; 1158 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1159 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1160 1161 idx = a->j; 1162 v = a->a; 1163 if (usecprow) { 1164 mbs = a->compressedrow.nrows; 1165 ii = a->compressedrow.i; 1166 ridx = a->compressedrow.rindex; 1167 } else { 1168 mbs = a->mbs; 1169 ii = a->i; 1170 z = zarray; 1171 } 1172 1173 for (i=0; i<mbs; i++) { 1174 n = ii[1] - ii[0]; 1175 ii++; 1176 sum1 = 0.0; 1177 sum2 = 0.0; 1178 sum3 = 0.0; 1179 sum4 = 0.0; 1180 sum5 = 0.0; 1181 sum6 = 0.0; 1182 sum7 = 0.0; 1183 1184 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1185 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1186 for (j=0; j<n; j++) { 1187 xb = x + 7*(*idx++); 1188 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1189 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1193 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1196 v += 49; 1197 } 1198 if (usecprow) z = zarray + 7*ridx[i]; 1199 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1200 if (!usecprow) z += 7; 1201 } 1202 1203 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1204 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1205 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1206 PetscFunctionReturn(0); 1207 } 1208 #endif 1209 1210 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1211 /* Default MatMult for block size 15 */ 1212 1213 #undef __FUNCT__ 1214 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1215 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1216 { 1217 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1218 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1219 const PetscScalar *x,*xb; 1220 PetscScalar *zarray,xv; 1221 const MatScalar *v; 1222 PetscErrorCode ierr; 1223 const PetscInt *ii,*ij=a->j,*idx; 1224 PetscInt mbs,i,j,k,n,*ridx=NULL; 1225 PetscBool usecprow=a->compressedrow.use; 1226 1227 PetscFunctionBegin; 1228 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1229 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1230 1231 v = a->a; 1232 if (usecprow) { 1233 mbs = a->compressedrow.nrows; 1234 ii = a->compressedrow.i; 1235 ridx = a->compressedrow.rindex; 1236 } else { 1237 mbs = a->mbs; 1238 ii = a->i; 1239 z = zarray; 1240 } 1241 1242 for (i=0; i<mbs; i++) { 1243 n = ii[i+1] - ii[i]; 1244 idx = ij + ii[i]; 1245 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1246 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1247 1248 for (j=0; j<n; j++) { 1249 xb = x + 15*(idx[j]); 1250 1251 for (k=0; k<15; k++) { 1252 xv = xb[k]; 1253 sum1 += v[0]*xv; 1254 sum2 += v[1]*xv; 1255 sum3 += v[2]*xv; 1256 sum4 += v[3]*xv; 1257 sum5 += v[4]*xv; 1258 sum6 += v[5]*xv; 1259 sum7 += v[6]*xv; 1260 sum8 += v[7]*xv; 1261 sum9 += v[8]*xv; 1262 sum10 += v[9]*xv; 1263 sum11 += v[10]*xv; 1264 sum12 += v[11]*xv; 1265 sum13 += v[12]*xv; 1266 sum14 += v[13]*xv; 1267 sum15 += v[14]*xv; 1268 v += 15; 1269 } 1270 } 1271 if (usecprow) z = zarray + 15*ridx[i]; 1272 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1273 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1274 1275 if (!usecprow) z += 15; 1276 } 1277 1278 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1279 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1280 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1281 PetscFunctionReturn(0); 1282 } 1283 1284 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1285 #undef __FUNCT__ 1286 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1287 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1288 { 1289 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1290 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1291 const PetscScalar *x,*xb; 1292 PetscScalar x1,x2,x3,x4,*zarray; 1293 const MatScalar *v; 1294 PetscErrorCode ierr; 1295 const PetscInt *ii,*ij=a->j,*idx; 1296 PetscInt mbs,i,j,n,*ridx=NULL; 1297 PetscBool usecprow=a->compressedrow.use; 1298 1299 PetscFunctionBegin; 1300 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1301 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1302 1303 v = a->a; 1304 if (usecprow) { 1305 mbs = a->compressedrow.nrows; 1306 ii = a->compressedrow.i; 1307 ridx = a->compressedrow.rindex; 1308 } else { 1309 mbs = a->mbs; 1310 ii = a->i; 1311 z = zarray; 1312 } 1313 1314 for (i=0; i<mbs; i++) { 1315 n = ii[i+1] - ii[i]; 1316 idx = ij + ii[i]; 1317 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1318 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1319 1320 for (j=0; j<n; j++) { 1321 xb = x + 15*(idx[j]); 1322 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1323 1324 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1325 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1326 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1327 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1328 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1329 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1330 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1331 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1332 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1333 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1334 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1335 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1336 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1337 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1338 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1339 1340 v += 60; 1341 1342 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1343 1344 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1345 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1346 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1347 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1348 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1349 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1350 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1351 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1352 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1353 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1354 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1355 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1356 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1357 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1358 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1359 v += 60; 1360 1361 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1362 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1363 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1364 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1365 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1366 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1367 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1368 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1369 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1370 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1371 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1372 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1373 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1374 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1375 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1376 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1377 v += 60; 1378 1379 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1380 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1381 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1382 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1383 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1384 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1385 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1386 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1387 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1388 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1389 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1390 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1391 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1392 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1393 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1394 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1395 v += 45; 1396 } 1397 if (usecprow) z = zarray + 15*ridx[i]; 1398 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1399 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1400 1401 if (!usecprow) z += 15; 1402 } 1403 1404 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1405 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1406 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1407 PetscFunctionReturn(0); 1408 } 1409 1410 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1411 #undef __FUNCT__ 1412 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1413 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1414 { 1415 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1416 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1417 const PetscScalar *x,*xb; 1418 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1419 const MatScalar *v; 1420 PetscErrorCode ierr; 1421 const PetscInt *ii,*ij=a->j,*idx; 1422 PetscInt mbs,i,j,n,*ridx=NULL; 1423 PetscBool usecprow=a->compressedrow.use; 1424 1425 PetscFunctionBegin; 1426 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1427 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1428 1429 v = a->a; 1430 if (usecprow) { 1431 mbs = a->compressedrow.nrows; 1432 ii = a->compressedrow.i; 1433 ridx = a->compressedrow.rindex; 1434 } else { 1435 mbs = a->mbs; 1436 ii = a->i; 1437 z = zarray; 1438 } 1439 1440 for (i=0; i<mbs; i++) { 1441 n = ii[i+1] - ii[i]; 1442 idx = ij + ii[i]; 1443 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1444 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1445 1446 for (j=0; j<n; j++) { 1447 xb = x + 15*(idx[j]); 1448 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1449 x8 = xb[7]; 1450 1451 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1452 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1453 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1454 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1455 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1456 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1457 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1458 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1459 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1460 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1461 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1462 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1463 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1464 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1465 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1466 v += 120; 1467 1468 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1469 1470 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1471 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1472 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1473 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1474 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1475 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1476 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1477 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1478 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1479 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1480 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1481 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1482 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1483 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1484 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1485 v += 105; 1486 } 1487 if (usecprow) z = zarray + 15*ridx[i]; 1488 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1489 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1490 1491 if (!usecprow) z += 15; 1492 } 1493 1494 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1495 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1496 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1497 PetscFunctionReturn(0); 1498 } 1499 1500 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1501 1502 #undef __FUNCT__ 1503 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1504 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1505 { 1506 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1507 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1508 const PetscScalar *x,*xb; 1509 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1510 const MatScalar *v; 1511 PetscErrorCode ierr; 1512 const PetscInt *ii,*ij=a->j,*idx; 1513 PetscInt mbs,i,j,n,*ridx=NULL; 1514 PetscBool usecprow=a->compressedrow.use; 1515 1516 PetscFunctionBegin; 1517 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1518 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1519 1520 v = a->a; 1521 if (usecprow) { 1522 mbs = a->compressedrow.nrows; 1523 ii = a->compressedrow.i; 1524 ridx = a->compressedrow.rindex; 1525 } else { 1526 mbs = a->mbs; 1527 ii = a->i; 1528 z = zarray; 1529 } 1530 1531 for (i=0; i<mbs; i++) { 1532 n = ii[i+1] - ii[i]; 1533 idx = ij + ii[i]; 1534 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1535 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1536 1537 for (j=0; j<n; j++) { 1538 xb = x + 15*(idx[j]); 1539 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1540 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1541 1542 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1543 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1544 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1545 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1546 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1547 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1548 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1549 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1550 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1551 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1552 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1553 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1554 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1555 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1556 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1557 v += 225; 1558 } 1559 if (usecprow) z = zarray + 15*ridx[i]; 1560 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1561 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1562 1563 if (!usecprow) z += 15; 1564 } 1565 1566 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1567 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1568 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1569 PetscFunctionReturn(0); 1570 } 1571 1572 1573 /* 1574 This will not work with MatScalar == float because it calls the BLAS 1575 */ 1576 #undef __FUNCT__ 1577 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1578 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1579 { 1580 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1581 PetscScalar *z = 0,*work,*workt,*zarray; 1582 const PetscScalar *x,*xb; 1583 const MatScalar *v; 1584 PetscErrorCode ierr; 1585 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1586 const PetscInt *idx,*ii,*ridx=NULL; 1587 PetscInt ncols,k; 1588 PetscBool usecprow=a->compressedrow.use; 1589 1590 PetscFunctionBegin; 1591 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1592 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1593 1594 idx = a->j; 1595 v = a->a; 1596 if (usecprow) { 1597 mbs = a->compressedrow.nrows; 1598 ii = a->compressedrow.i; 1599 ridx = a->compressedrow.rindex; 1600 } else { 1601 mbs = a->mbs; 1602 ii = a->i; 1603 z = zarray; 1604 } 1605 1606 if (!a->mult_work) { 1607 k = PetscMax(A->rmap->n,A->cmap->n); 1608 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1609 } 1610 work = a->mult_work; 1611 for (i=0; i<mbs; i++) { 1612 n = ii[1] - ii[0]; ii++; 1613 ncols = n*bs; 1614 workt = work; 1615 for (j=0; j<n; j++) { 1616 xb = x + bs*(*idx++); 1617 for (k=0; k<bs; k++) workt[k] = xb[k]; 1618 workt += bs; 1619 } 1620 if (usecprow) z = zarray + bs*ridx[i]; 1621 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1622 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1623 v += n*bs2; 1624 if (!usecprow) z += bs; 1625 } 1626 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1627 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1628 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1629 PetscFunctionReturn(0); 1630 } 1631 1632 #undef __FUNCT__ 1633 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1634 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1635 { 1636 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1637 const PetscScalar *x; 1638 PetscScalar *y,*z,sum; 1639 const MatScalar *v; 1640 PetscErrorCode ierr; 1641 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1642 const PetscInt *idx,*ii; 1643 PetscBool usecprow=a->compressedrow.use; 1644 1645 PetscFunctionBegin; 1646 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1647 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1648 1649 idx = a->j; 1650 v = a->a; 1651 if (usecprow) { 1652 if (zz != yy) { 1653 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1654 } 1655 mbs = a->compressedrow.nrows; 1656 ii = a->compressedrow.i; 1657 ridx = a->compressedrow.rindex; 1658 } else { 1659 ii = a->i; 1660 } 1661 1662 for (i=0; i<mbs; i++) { 1663 n = ii[1] - ii[0]; 1664 ii++; 1665 if (!usecprow) { 1666 sum = y[i]; 1667 } else { 1668 sum = y[ridx[i]]; 1669 } 1670 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1671 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1672 PetscSparseDensePlusDot(sum,x,v,idx,n); 1673 v += n; 1674 idx += n; 1675 if (usecprow) { 1676 z[ridx[i]] = sum; 1677 } else { 1678 z[i] = sum; 1679 } 1680 } 1681 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1682 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1683 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1684 PetscFunctionReturn(0); 1685 } 1686 1687 #undef __FUNCT__ 1688 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1689 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1690 { 1691 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1692 PetscScalar *y = 0,*z = 0,sum1,sum2; 1693 const PetscScalar *x,*xb; 1694 PetscScalar x1,x2,*yarray,*zarray; 1695 const MatScalar *v; 1696 PetscErrorCode ierr; 1697 PetscInt mbs = a->mbs,i,n,j; 1698 const PetscInt *idx,*ii,*ridx = NULL; 1699 PetscBool usecprow = a->compressedrow.use; 1700 1701 PetscFunctionBegin; 1702 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1703 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1704 1705 idx = a->j; 1706 v = a->a; 1707 if (usecprow) { 1708 if (zz != yy) { 1709 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1710 } 1711 mbs = a->compressedrow.nrows; 1712 ii = a->compressedrow.i; 1713 ridx = a->compressedrow.rindex; 1714 if (zz != yy) { 1715 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1716 } 1717 } else { 1718 ii = a->i; 1719 y = yarray; 1720 z = zarray; 1721 } 1722 1723 for (i=0; i<mbs; i++) { 1724 n = ii[1] - ii[0]; ii++; 1725 if (usecprow) { 1726 z = zarray + 2*ridx[i]; 1727 y = yarray + 2*ridx[i]; 1728 } 1729 sum1 = y[0]; sum2 = y[1]; 1730 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1731 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1732 for (j=0; j<n; j++) { 1733 xb = x + 2*(*idx++); 1734 x1 = xb[0]; 1735 x2 = xb[1]; 1736 1737 sum1 += v[0]*x1 + v[2]*x2; 1738 sum2 += v[1]*x1 + v[3]*x2; 1739 v += 4; 1740 } 1741 z[0] = sum1; z[1] = sum2; 1742 if (!usecprow) { 1743 z += 2; y += 2; 1744 } 1745 } 1746 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1747 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1748 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1749 PetscFunctionReturn(0); 1750 } 1751 1752 #undef __FUNCT__ 1753 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1754 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1755 { 1756 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1757 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1758 const PetscScalar *x,*xb; 1759 const MatScalar *v; 1760 PetscErrorCode ierr; 1761 PetscInt mbs = a->mbs,i,j,n; 1762 const PetscInt *idx,*ii,*ridx = NULL; 1763 PetscBool usecprow = a->compressedrow.use; 1764 1765 PetscFunctionBegin; 1766 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1767 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1768 1769 idx = a->j; 1770 v = a->a; 1771 if (usecprow) { 1772 if (zz != yy) { 1773 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1774 } 1775 mbs = a->compressedrow.nrows; 1776 ii = a->compressedrow.i; 1777 ridx = a->compressedrow.rindex; 1778 } else { 1779 ii = a->i; 1780 y = yarray; 1781 z = zarray; 1782 } 1783 1784 for (i=0; i<mbs; i++) { 1785 n = ii[1] - ii[0]; ii++; 1786 if (usecprow) { 1787 z = zarray + 3*ridx[i]; 1788 y = yarray + 3*ridx[i]; 1789 } 1790 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1791 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1792 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1793 for (j=0; j<n; j++) { 1794 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1795 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1796 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1797 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1798 v += 9; 1799 } 1800 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1801 if (!usecprow) { 1802 z += 3; y += 3; 1803 } 1804 } 1805 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1806 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1807 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1808 PetscFunctionReturn(0); 1809 } 1810 1811 #undef __FUNCT__ 1812 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1813 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1814 { 1815 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1816 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1817 const PetscScalar *x,*xb; 1818 const MatScalar *v; 1819 PetscErrorCode ierr; 1820 PetscInt mbs = a->mbs,i,j,n; 1821 const PetscInt *idx,*ii,*ridx=NULL; 1822 PetscBool usecprow=a->compressedrow.use; 1823 1824 PetscFunctionBegin; 1825 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1826 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1827 1828 idx = a->j; 1829 v = a->a; 1830 if (usecprow) { 1831 if (zz != yy) { 1832 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1833 } 1834 mbs = a->compressedrow.nrows; 1835 ii = a->compressedrow.i; 1836 ridx = a->compressedrow.rindex; 1837 } else { 1838 ii = a->i; 1839 y = yarray; 1840 z = zarray; 1841 } 1842 1843 for (i=0; i<mbs; i++) { 1844 n = ii[1] - ii[0]; ii++; 1845 if (usecprow) { 1846 z = zarray + 4*ridx[i]; 1847 y = yarray + 4*ridx[i]; 1848 } 1849 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1850 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1851 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1852 for (j=0; j<n; j++) { 1853 xb = x + 4*(*idx++); 1854 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1855 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1856 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1857 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1858 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1859 v += 16; 1860 } 1861 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1862 if (!usecprow) { 1863 z += 4; y += 4; 1864 } 1865 } 1866 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1867 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1868 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1869 PetscFunctionReturn(0); 1870 } 1871 1872 #undef __FUNCT__ 1873 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1874 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1875 { 1876 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1877 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1878 const PetscScalar *x,*xb; 1879 PetscScalar *yarray,*zarray; 1880 const MatScalar *v; 1881 PetscErrorCode ierr; 1882 PetscInt mbs = a->mbs,i,j,n; 1883 const PetscInt *idx,*ii,*ridx = NULL; 1884 PetscBool usecprow=a->compressedrow.use; 1885 1886 PetscFunctionBegin; 1887 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1888 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1889 1890 idx = a->j; 1891 v = a->a; 1892 if (usecprow) { 1893 if (zz != yy) { 1894 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1895 } 1896 mbs = a->compressedrow.nrows; 1897 ii = a->compressedrow.i; 1898 ridx = a->compressedrow.rindex; 1899 } else { 1900 ii = a->i; 1901 y = yarray; 1902 z = zarray; 1903 } 1904 1905 for (i=0; i<mbs; i++) { 1906 n = ii[1] - ii[0]; ii++; 1907 if (usecprow) { 1908 z = zarray + 5*ridx[i]; 1909 y = yarray + 5*ridx[i]; 1910 } 1911 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1912 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1913 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1914 for (j=0; j<n; j++) { 1915 xb = x + 5*(*idx++); 1916 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1917 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1918 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1919 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1920 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1921 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1922 v += 25; 1923 } 1924 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1925 if (!usecprow) { 1926 z += 5; y += 5; 1927 } 1928 } 1929 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1930 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1931 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1932 PetscFunctionReturn(0); 1933 } 1934 #undef __FUNCT__ 1935 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1936 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1937 { 1938 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1939 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1940 const PetscScalar *x,*xb; 1941 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1942 const MatScalar *v; 1943 PetscErrorCode ierr; 1944 PetscInt mbs = a->mbs,i,j,n; 1945 const PetscInt *idx,*ii,*ridx=NULL; 1946 PetscBool usecprow=a->compressedrow.use; 1947 1948 PetscFunctionBegin; 1949 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1950 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1951 1952 idx = a->j; 1953 v = a->a; 1954 if (usecprow) { 1955 if (zz != yy) { 1956 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1957 } 1958 mbs = a->compressedrow.nrows; 1959 ii = a->compressedrow.i; 1960 ridx = a->compressedrow.rindex; 1961 } else { 1962 ii = a->i; 1963 y = yarray; 1964 z = zarray; 1965 } 1966 1967 for (i=0; i<mbs; i++) { 1968 n = ii[1] - ii[0]; ii++; 1969 if (usecprow) { 1970 z = zarray + 6*ridx[i]; 1971 y = yarray + 6*ridx[i]; 1972 } 1973 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1974 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1975 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1976 for (j=0; j<n; j++) { 1977 xb = x + 6*(*idx++); 1978 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1979 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1980 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1981 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1982 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1983 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1984 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1985 v += 36; 1986 } 1987 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1988 if (!usecprow) { 1989 z += 6; y += 6; 1990 } 1991 } 1992 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1993 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1994 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 1995 PetscFunctionReturn(0); 1996 } 1997 1998 #undef __FUNCT__ 1999 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2000 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2001 { 2002 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2003 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2004 const PetscScalar *x,*xb; 2005 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2006 const MatScalar *v; 2007 PetscErrorCode ierr; 2008 PetscInt mbs = a->mbs,i,j,n; 2009 const PetscInt *idx,*ii,*ridx = NULL; 2010 PetscBool usecprow=a->compressedrow.use; 2011 2012 PetscFunctionBegin; 2013 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2014 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2015 2016 idx = a->j; 2017 v = a->a; 2018 if (usecprow) { 2019 if (zz != yy) { 2020 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2021 } 2022 mbs = a->compressedrow.nrows; 2023 ii = a->compressedrow.i; 2024 ridx = a->compressedrow.rindex; 2025 } else { 2026 ii = a->i; 2027 y = yarray; 2028 z = zarray; 2029 } 2030 2031 for (i=0; i<mbs; i++) { 2032 n = ii[1] - ii[0]; ii++; 2033 if (usecprow) { 2034 z = zarray + 7*ridx[i]; 2035 y = yarray + 7*ridx[i]; 2036 } 2037 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2038 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2039 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2040 for (j=0; j<n; j++) { 2041 xb = x + 7*(*idx++); 2042 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2043 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2044 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2045 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2046 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2047 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2048 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2049 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2050 v += 49; 2051 } 2052 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2053 if (!usecprow) { 2054 z += 7; y += 7; 2055 } 2056 } 2057 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2058 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2059 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2060 PetscFunctionReturn(0); 2061 } 2062 2063 #undef __FUNCT__ 2064 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2065 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2066 { 2067 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2068 PetscScalar *z = 0,*work,*workt,*zarray; 2069 const PetscScalar *x,*xb; 2070 const MatScalar *v; 2071 PetscErrorCode ierr; 2072 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2073 PetscInt ncols,k; 2074 const PetscInt *ridx = NULL,*idx,*ii; 2075 PetscBool usecprow = a->compressedrow.use; 2076 2077 PetscFunctionBegin; 2078 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2079 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2080 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2081 2082 idx = a->j; 2083 v = a->a; 2084 if (usecprow) { 2085 mbs = a->compressedrow.nrows; 2086 ii = a->compressedrow.i; 2087 ridx = a->compressedrow.rindex; 2088 } else { 2089 mbs = a->mbs; 2090 ii = a->i; 2091 z = zarray; 2092 } 2093 2094 if (!a->mult_work) { 2095 k = PetscMax(A->rmap->n,A->cmap->n); 2096 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2097 } 2098 work = a->mult_work; 2099 for (i=0; i<mbs; i++) { 2100 n = ii[1] - ii[0]; ii++; 2101 ncols = n*bs; 2102 workt = work; 2103 for (j=0; j<n; j++) { 2104 xb = x + bs*(*idx++); 2105 for (k=0; k<bs; k++) workt[k] = xb[k]; 2106 workt += bs; 2107 } 2108 if (usecprow) z = zarray + bs*ridx[i]; 2109 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2110 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2111 v += n*bs2; 2112 if (!usecprow) z += bs; 2113 } 2114 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2115 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2116 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2117 PetscFunctionReturn(0); 2118 } 2119 2120 #undef __FUNCT__ 2121 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2122 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2123 { 2124 PetscScalar zero = 0.0; 2125 PetscErrorCode ierr; 2126 2127 PetscFunctionBegin; 2128 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2129 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2130 PetscFunctionReturn(0); 2131 } 2132 2133 #undef __FUNCT__ 2134 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2135 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2136 { 2137 PetscScalar zero = 0.0; 2138 PetscErrorCode ierr; 2139 2140 PetscFunctionBegin; 2141 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2142 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2143 PetscFunctionReturn(0); 2144 } 2145 2146 #undef __FUNCT__ 2147 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2148 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2149 { 2150 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2151 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2152 const PetscScalar *x,*xb = NULL; 2153 const MatScalar *v; 2154 PetscErrorCode ierr; 2155 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2156 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2157 Mat_CompressedRow cprow = a->compressedrow; 2158 PetscBool usecprow = cprow.use; 2159 2160 PetscFunctionBegin; 2161 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2162 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2163 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2164 2165 idx = a->j; 2166 v = a->a; 2167 if (usecprow) { 2168 mbs = cprow.nrows; 2169 ii = cprow.i; 2170 ridx = cprow.rindex; 2171 } else { 2172 mbs=a->mbs; 2173 ii = a->i; 2174 xb = x; 2175 } 2176 2177 switch (bs) { 2178 case 1: 2179 for (i=0; i<mbs; i++) { 2180 if (usecprow) xb = x + ridx[i]; 2181 x1 = xb[0]; 2182 ib = idx + ii[0]; 2183 n = ii[1] - ii[0]; ii++; 2184 for (j=0; j<n; j++) { 2185 rval = ib[j]; 2186 z[rval] += PetscConj(*v) * x1; 2187 v++; 2188 } 2189 if (!usecprow) xb++; 2190 } 2191 break; 2192 case 2: 2193 for (i=0; i<mbs; i++) { 2194 if (usecprow) xb = x + 2*ridx[i]; 2195 x1 = xb[0]; x2 = xb[1]; 2196 ib = idx + ii[0]; 2197 n = ii[1] - ii[0]; ii++; 2198 for (j=0; j<n; j++) { 2199 rval = ib[j]*2; 2200 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2201 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2202 v += 4; 2203 } 2204 if (!usecprow) xb += 2; 2205 } 2206 break; 2207 case 3: 2208 for (i=0; i<mbs; i++) { 2209 if (usecprow) xb = x + 3*ridx[i]; 2210 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2211 ib = idx + ii[0]; 2212 n = ii[1] - ii[0]; ii++; 2213 for (j=0; j<n; j++) { 2214 rval = ib[j]*3; 2215 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2216 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2217 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2218 v += 9; 2219 } 2220 if (!usecprow) xb += 3; 2221 } 2222 break; 2223 case 4: 2224 for (i=0; i<mbs; i++) { 2225 if (usecprow) xb = x + 4*ridx[i]; 2226 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2227 ib = idx + ii[0]; 2228 n = ii[1] - ii[0]; ii++; 2229 for (j=0; j<n; j++) { 2230 rval = ib[j]*4; 2231 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2232 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2233 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2234 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2235 v += 16; 2236 } 2237 if (!usecprow) xb += 4; 2238 } 2239 break; 2240 case 5: 2241 for (i=0; i<mbs; i++) { 2242 if (usecprow) xb = x + 5*ridx[i]; 2243 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2244 x4 = xb[3]; x5 = xb[4]; 2245 ib = idx + ii[0]; 2246 n = ii[1] - ii[0]; ii++; 2247 for (j=0; j<n; j++) { 2248 rval = ib[j]*5; 2249 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2250 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2251 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2252 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2253 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2254 v += 25; 2255 } 2256 if (!usecprow) xb += 5; 2257 } 2258 break; 2259 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 2260 PetscInt ncols,k; 2261 PetscScalar *work,*workt; 2262 const PetscScalar *xtmp; 2263 2264 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2265 if (!a->mult_work) { 2266 k = PetscMax(A->rmap->n,A->cmap->n); 2267 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2268 } 2269 work = a->mult_work; 2270 xtmp = x; 2271 for (i=0; i<mbs; i++) { 2272 n = ii[1] - ii[0]; ii++; 2273 ncols = n*bs; 2274 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2275 if (usecprow) xtmp = x + bs*ridx[i]; 2276 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2277 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2278 v += n*bs2; 2279 if (!usecprow) xtmp += bs; 2280 workt = work; 2281 for (j=0; j<n; j++) { 2282 zb = z + bs*(*idx++); 2283 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2284 workt += bs; 2285 } 2286 } 2287 } 2288 } 2289 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2290 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2291 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2292 PetscFunctionReturn(0); 2293 } 2294 2295 #undef __FUNCT__ 2296 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2297 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2298 { 2299 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2300 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2301 const PetscScalar *x,*xb = 0; 2302 const MatScalar *v; 2303 PetscErrorCode ierr; 2304 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2305 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2306 Mat_CompressedRow cprow = a->compressedrow; 2307 PetscBool usecprow=cprow.use; 2308 2309 PetscFunctionBegin; 2310 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2311 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2312 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2313 2314 idx = a->j; 2315 v = a->a; 2316 if (usecprow) { 2317 mbs = cprow.nrows; 2318 ii = cprow.i; 2319 ridx = cprow.rindex; 2320 } else { 2321 mbs=a->mbs; 2322 ii = a->i; 2323 xb = x; 2324 } 2325 2326 switch (bs) { 2327 case 1: 2328 for (i=0; i<mbs; i++) { 2329 if (usecprow) xb = x + ridx[i]; 2330 x1 = xb[0]; 2331 ib = idx + ii[0]; 2332 n = ii[1] - ii[0]; ii++; 2333 for (j=0; j<n; j++) { 2334 rval = ib[j]; 2335 z[rval] += *v * x1; 2336 v++; 2337 } 2338 if (!usecprow) xb++; 2339 } 2340 break; 2341 case 2: 2342 for (i=0; i<mbs; i++) { 2343 if (usecprow) xb = x + 2*ridx[i]; 2344 x1 = xb[0]; x2 = xb[1]; 2345 ib = idx + ii[0]; 2346 n = ii[1] - ii[0]; ii++; 2347 for (j=0; j<n; j++) { 2348 rval = ib[j]*2; 2349 z[rval++] += v[0]*x1 + v[1]*x2; 2350 z[rval++] += v[2]*x1 + v[3]*x2; 2351 v += 4; 2352 } 2353 if (!usecprow) xb += 2; 2354 } 2355 break; 2356 case 3: 2357 for (i=0; i<mbs; i++) { 2358 if (usecprow) xb = x + 3*ridx[i]; 2359 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2360 ib = idx + ii[0]; 2361 n = ii[1] - ii[0]; ii++; 2362 for (j=0; j<n; j++) { 2363 rval = ib[j]*3; 2364 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2365 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2366 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2367 v += 9; 2368 } 2369 if (!usecprow) xb += 3; 2370 } 2371 break; 2372 case 4: 2373 for (i=0; i<mbs; i++) { 2374 if (usecprow) xb = x + 4*ridx[i]; 2375 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2376 ib = idx + ii[0]; 2377 n = ii[1] - ii[0]; ii++; 2378 for (j=0; j<n; j++) { 2379 rval = ib[j]*4; 2380 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2381 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2382 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2383 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2384 v += 16; 2385 } 2386 if (!usecprow) xb += 4; 2387 } 2388 break; 2389 case 5: 2390 for (i=0; i<mbs; i++) { 2391 if (usecprow) xb = x + 5*ridx[i]; 2392 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2393 x4 = xb[3]; x5 = xb[4]; 2394 ib = idx + ii[0]; 2395 n = ii[1] - ii[0]; ii++; 2396 for (j=0; j<n; j++) { 2397 rval = ib[j]*5; 2398 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2399 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2400 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2401 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2402 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2403 v += 25; 2404 } 2405 if (!usecprow) xb += 5; 2406 } 2407 break; 2408 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2409 PetscInt ncols,k; 2410 PetscScalar *work,*workt; 2411 const PetscScalar *xtmp; 2412 if (!a->mult_work) { 2413 k = PetscMax(A->rmap->n,A->cmap->n); 2414 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2415 } 2416 work = a->mult_work; 2417 xtmp = x; 2418 for (i=0; i<mbs; i++) { 2419 n = ii[1] - ii[0]; ii++; 2420 ncols = n*bs; 2421 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2422 if (usecprow) xtmp = x + bs*ridx[i]; 2423 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2424 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2425 v += n*bs2; 2426 if (!usecprow) xtmp += bs; 2427 workt = work; 2428 for (j=0; j<n; j++) { 2429 zb = z + bs*(*idx++); 2430 for (k=0; k<bs; k++) zb[k] += workt[k]; 2431 workt += bs; 2432 } 2433 } 2434 } 2435 } 2436 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2437 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2438 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2439 PetscFunctionReturn(0); 2440 } 2441 2442 #undef __FUNCT__ 2443 #define __FUNCT__ "MatScale_SeqBAIJ" 2444 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2445 { 2446 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2447 PetscInt totalnz = a->bs2*a->nz; 2448 PetscScalar oalpha = alpha; 2449 PetscErrorCode ierr; 2450 PetscBLASInt one = 1,tnz; 2451 2452 PetscFunctionBegin; 2453 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2454 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2455 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2456 PetscFunctionReturn(0); 2457 } 2458 2459 #undef __FUNCT__ 2460 #define __FUNCT__ "MatNorm_SeqBAIJ" 2461 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2462 { 2463 PetscErrorCode ierr; 2464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2465 MatScalar *v = a->a; 2466 PetscReal sum = 0.0; 2467 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2468 2469 PetscFunctionBegin; 2470 if (type == NORM_FROBENIUS) { 2471 for (i=0; i< bs2*nz; i++) { 2472 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2473 } 2474 *norm = PetscSqrtReal(sum); 2475 } else if (type == NORM_1) { /* maximum column sum */ 2476 PetscReal *tmp; 2477 PetscInt *bcol = a->j; 2478 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2479 for (i=0; i<nz; i++) { 2480 for (j=0; j<bs; j++) { 2481 k1 = bs*(*bcol) + j; /* column index */ 2482 for (k=0; k<bs; k++) { 2483 tmp[k1] += PetscAbsScalar(*v); v++; 2484 } 2485 } 2486 bcol++; 2487 } 2488 *norm = 0.0; 2489 for (j=0; j<A->cmap->n; j++) { 2490 if (tmp[j] > *norm) *norm = tmp[j]; 2491 } 2492 ierr = PetscFree(tmp);CHKERRQ(ierr); 2493 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2494 *norm = 0.0; 2495 for (k=0; k<bs; k++) { 2496 for (j=0; j<a->mbs; j++) { 2497 v = a->a + bs2*a->i[j] + k; 2498 sum = 0.0; 2499 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2500 for (k1=0; k1<bs; k1++) { 2501 sum += PetscAbsScalar(*v); 2502 v += bs; 2503 } 2504 } 2505 if (sum > *norm) *norm = sum; 2506 } 2507 } 2508 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2509 PetscFunctionReturn(0); 2510 } 2511 2512 2513 #undef __FUNCT__ 2514 #define __FUNCT__ "MatEqual_SeqBAIJ" 2515 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2516 { 2517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2518 PetscErrorCode ierr; 2519 2520 PetscFunctionBegin; 2521 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2522 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2523 *flg = PETSC_FALSE; 2524 PetscFunctionReturn(0); 2525 } 2526 2527 /* if the a->i are the same */ 2528 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2529 if (!*flg) PetscFunctionReturn(0); 2530 2531 /* if a->j are the same */ 2532 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2533 if (!*flg) PetscFunctionReturn(0); 2534 2535 /* if a->a are the same */ 2536 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2537 PetscFunctionReturn(0); 2538 2539 } 2540 2541 #undef __FUNCT__ 2542 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2543 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2544 { 2545 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2546 PetscErrorCode ierr; 2547 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2548 PetscScalar *x,zero = 0.0; 2549 MatScalar *aa,*aa_j; 2550 2551 PetscFunctionBegin; 2552 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2553 bs = A->rmap->bs; 2554 aa = a->a; 2555 ai = a->i; 2556 aj = a->j; 2557 ambs = a->mbs; 2558 bs2 = a->bs2; 2559 2560 ierr = VecSet(v,zero);CHKERRQ(ierr); 2561 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2562 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2563 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2564 for (i=0; i<ambs; i++) { 2565 for (j=ai[i]; j<ai[i+1]; j++) { 2566 if (aj[j] == i) { 2567 row = i*bs; 2568 aa_j = aa+j*bs2; 2569 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2570 break; 2571 } 2572 } 2573 } 2574 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2575 PetscFunctionReturn(0); 2576 } 2577 2578 #undef __FUNCT__ 2579 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2580 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2581 { 2582 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2583 const PetscScalar *l,*r,*li,*ri; 2584 PetscScalar x; 2585 MatScalar *aa, *v; 2586 PetscErrorCode ierr; 2587 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2588 const PetscInt *ai,*aj; 2589 2590 PetscFunctionBegin; 2591 ai = a->i; 2592 aj = a->j; 2593 aa = a->a; 2594 m = A->rmap->n; 2595 n = A->cmap->n; 2596 bs = A->rmap->bs; 2597 mbs = a->mbs; 2598 bs2 = a->bs2; 2599 if (ll) { 2600 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2601 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2602 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2603 for (i=0; i<mbs; i++) { /* for each block row */ 2604 M = ai[i+1] - ai[i]; 2605 li = l + i*bs; 2606 v = aa + bs2*ai[i]; 2607 for (j=0; j<M; j++) { /* for each block */ 2608 for (k=0; k<bs2; k++) { 2609 (*v++) *= li[k%bs]; 2610 } 2611 } 2612 } 2613 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2614 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2615 } 2616 2617 if (rr) { 2618 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2619 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2620 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2621 for (i=0; i<mbs; i++) { /* for each block row */ 2622 iai = ai[i]; 2623 M = ai[i+1] - iai; 2624 v = aa + bs2*iai; 2625 for (j=0; j<M; j++) { /* for each block */ 2626 ri = r + bs*aj[iai+j]; 2627 for (k=0; k<bs; k++) { 2628 x = ri[k]; 2629 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2630 v += bs; 2631 } 2632 } 2633 } 2634 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2635 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2636 } 2637 PetscFunctionReturn(0); 2638 } 2639 2640 2641 #undef __FUNCT__ 2642 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2643 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2644 { 2645 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2646 2647 PetscFunctionBegin; 2648 info->block_size = a->bs2; 2649 info->nz_allocated = a->bs2*a->maxnz; 2650 info->nz_used = a->bs2*a->nz; 2651 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2652 info->assemblies = A->num_ass; 2653 info->mallocs = A->info.mallocs; 2654 info->memory = ((PetscObject)A)->mem; 2655 if (A->factortype) { 2656 info->fill_ratio_given = A->info.fill_ratio_given; 2657 info->fill_ratio_needed = A->info.fill_ratio_needed; 2658 info->factor_mallocs = A->info.factor_mallocs; 2659 } else { 2660 info->fill_ratio_given = 0; 2661 info->fill_ratio_needed = 0; 2662 info->factor_mallocs = 0; 2663 } 2664 PetscFunctionReturn(0); 2665 } 2666 2667 2668 #if defined(PETSC_THREADCOMM_ACTIVE) 2669 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2670 { 2671 PetscErrorCode ierr; 2672 PetscInt *trstarts=A->rmap->trstarts; 2673 PetscInt n,start,end; 2674 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2675 2676 start = trstarts[thread_id]/A->rmap->bs; 2677 end = trstarts[thread_id+1]/A->rmap->bs; 2678 n = a->i[end] - a->i[start]; 2679 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2680 return 0; 2681 } 2682 2683 #undef __FUNCT__ 2684 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2685 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2686 { 2687 PetscErrorCode ierr; 2688 2689 2690 PetscFunctionBegin; 2691 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2692 PetscFunctionReturn(0); 2693 } 2694 #else 2695 #undef __FUNCT__ 2696 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2697 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2698 { 2699 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2700 PetscErrorCode ierr; 2701 2702 PetscFunctionBegin; 2703 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 #endif 2707