1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag,sorted; 90 91 PetscFunctionBegin; 92 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 93 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 94 95 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 96 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 98 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 99 100 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 101 ssmap = smap; 102 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 103 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 104 /* determine lens of each row */ 105 for (i=0; i<nrows; i++) { 106 kstart = ai[irow[i]]; 107 kend = kstart + a->ilen[irow[i]]; 108 lens[i] = 0; 109 for (k=kstart; k<kend; k++) { 110 if (ssmap[aj[k]]) lens[i]++; 111 } 112 } 113 /* Create and fill new matrix */ 114 if (scall == MAT_REUSE_MATRIX) { 115 c = (Mat_SeqBAIJ*)((*B)->data); 116 117 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 118 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 119 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 120 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 121 C = *B; 122 } else { 123 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 124 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 125 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 126 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 127 } 128 c = (Mat_SeqBAIJ*)(C->data); 129 for (i=0; i<nrows; i++) { 130 row = irow[i]; 131 kstart = ai[row]; 132 kend = kstart + a->ilen[row]; 133 mat_i = c->i[i]; 134 mat_j = c->j + mat_i; 135 mat_a = c->a + mat_i*bs2; 136 mat_ilen = c->ilen + i; 137 for (k=kstart; k<kend; k++) { 138 if ((tcol=ssmap[a->j[k]])) { 139 *mat_j++ = tcol - 1; 140 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 141 mat_a += bs2; 142 (*mat_ilen)++; 143 } 144 } 145 } 146 147 /* Free work space */ 148 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 149 ierr = PetscFree(smap);CHKERRQ(ierr); 150 ierr = PetscFree(lens);CHKERRQ(ierr); 151 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 152 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 153 154 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 155 *B = C; 156 PetscFunctionReturn(0); 157 } 158 159 #undef __FUNCT__ 160 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 161 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 162 { 163 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 164 IS is1,is2; 165 PetscErrorCode ierr; 166 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 167 const PetscInt *irow,*icol; 168 169 PetscFunctionBegin; 170 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 171 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 172 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 173 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 174 175 /* Verify if the indices corespond to each element in a block 176 and form the IS with compressed IS */ 177 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 178 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 179 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 180 count = 0; 181 for (i=0; i<a->mbs; i++) { 182 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 183 if (vary[i]==bs) iary[count++] = i; 184 } 185 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 186 187 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 188 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 189 count = 0; 190 for (i=0; i<a->mbs; i++) { 191 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 192 if (vary[i]==bs) iary[count++] = i; 193 } 194 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 195 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 196 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 197 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 198 199 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 200 ierr = ISDestroy(&is1);CHKERRQ(ierr); 201 ierr = ISDestroy(&is2);CHKERRQ(ierr); 202 PetscFunctionReturn(0); 203 } 204 205 #undef __FUNCT__ 206 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 207 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 208 { 209 PetscErrorCode ierr; 210 PetscInt i; 211 212 PetscFunctionBegin; 213 if (scall == MAT_INITIAL_MATRIX) { 214 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 215 } 216 217 for (i=0; i<n; i++) { 218 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 219 } 220 PetscFunctionReturn(0); 221 } 222 223 224 /* -------------------------------------------------------*/ 225 /* Should check that shapes of vectors and matrices match */ 226 /* -------------------------------------------------------*/ 227 228 #if defined(PETSC_THREADCOMM_ACTIVE) 229 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 230 { 231 PetscErrorCode ierr; 232 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 233 PetscScalar *z; 234 const PetscScalar *x; 235 const MatScalar *aa; 236 PetscInt *trstarts=A->rmap->trstarts; 237 PetscInt n,start,end,i; 238 const PetscInt *aj,*ai; 239 PetscScalar sum; 240 241 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 242 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 243 start = trstarts[thread_id]; 244 end = trstarts[thread_id+1]; 245 ai = a->i; 246 for (i=start; i<end; i++) { 247 n = ai[i+1] - ai[i]; 248 aj = a->j + ai[i]; 249 aa = a->a + ai[i]; 250 sum = 0.0; 251 PetscSparseDensePlusDot(sum,x,aa,aj,n); 252 z[i] = sum; 253 } 254 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 255 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 256 return 0; 257 } 258 259 #undef __FUNCT__ 260 #define __FUNCT__ "MatMult_SeqBAIJ_1" 261 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 262 { 263 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 264 PetscScalar *z,sum; 265 const PetscScalar *x; 266 const MatScalar *v; 267 PetscErrorCode ierr; 268 PetscInt mbs,i,n; 269 const PetscInt *idx,*ii,*ridx=NULL; 270 PetscBool usecprow=a->compressedrow.use; 271 272 PetscFunctionBegin; 273 274 if (usecprow) { 275 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 276 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 277 mbs = a->compressedrow.nrows; 278 ii = a->compressedrow.i; 279 ridx = a->compressedrow.rindex; 280 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 281 for (i=0; i<mbs; i++) { 282 n = ii[i+1] - ii[i]; 283 v = a->a + ii[i]; 284 idx = a->j + ii[i]; 285 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 286 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 287 sum = 0.0; 288 PetscSparseDensePlusDot(sum,x,v,idx,n); 289 z[ridx[i]] = sum; 290 } 291 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 292 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 293 } else { 294 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 295 } 296 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 #else 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatMult_SeqBAIJ_1" 302 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 303 { 304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 305 PetscScalar *z,sum; 306 const PetscScalar *x; 307 const MatScalar *v; 308 PetscErrorCode ierr; 309 PetscInt mbs,i,n; 310 const PetscInt *idx,*ii,*ridx=NULL; 311 PetscBool usecprow=a->compressedrow.use; 312 313 PetscFunctionBegin; 314 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 315 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 316 317 if (usecprow) { 318 mbs = a->compressedrow.nrows; 319 ii = a->compressedrow.i; 320 ridx = a->compressedrow.rindex; 321 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 322 } else { 323 mbs = a->mbs; 324 ii = a->i; 325 } 326 327 for (i=0; i<mbs; i++) { 328 n = ii[1] - ii[0]; 329 v = a->a + ii[0]; 330 idx = a->j + ii[0]; 331 ii++; 332 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 333 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 334 sum = 0.0; 335 PetscSparseDensePlusDot(sum,x,v,idx,n); 336 if (usecprow) { 337 z[ridx[i]] = sum; 338 } else { 339 z[i] = sum; 340 } 341 } 342 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 343 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 344 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 #endif 348 349 #if defined(PETSC_THREADCOMM_ACTIVE) 350 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 351 { 352 PetscErrorCode ierr; 353 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 354 PetscScalar *z,x1,x2,sum1,sum2; 355 const PetscScalar *x,*xb; 356 const MatScalar *aa; 357 PetscInt *trstarts=A->rmap->trstarts; 358 PetscInt n,start,end,i,j; 359 const PetscInt *aj,*ai; 360 361 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 362 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 363 start = trstarts[thread_id] / 2; 364 end = trstarts[thread_id+1] / 2; 365 ai = a->i; 366 for (i=start; i<end; i++) { 367 n = ai[i+1] - ai[i]; 368 aj = a->j + ai[i]; 369 aa = a->a + ai[i]*4; 370 sum1 = 0.0; sum2 = 0.0; 371 for (j=0; j<n; j++) { 372 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 373 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 374 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 375 } 376 z[2*i] = sum1; z[2*i+1] = sum2; 377 } 378 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 379 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 380 return 0; 381 } 382 383 #undef __FUNCT__ 384 #define __FUNCT__ "MatMult_SeqBAIJ_2" 385 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 386 { 387 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 388 PetscScalar *z,x1,x2,sum1,sum2; 389 const PetscScalar *x,*xb; 390 const MatScalar *v; 391 PetscErrorCode ierr; 392 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 393 PetscBool usecprow=a->compressedrow.use; 394 395 PetscFunctionBegin; 396 397 if (usecprow) { 398 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 399 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 400 mbs = a->compressedrow.nrows; 401 ii = a->compressedrow.i; 402 ridx = a->compressedrow.rindex; 403 for (i=0; i<mbs; i++) { 404 n = ii[i+1] - ii[i]; 405 idx = a->j + ii[i]; 406 v = a->a + ii[i]*4; 407 sum1 = 0.0; sum2 = 0.0; 408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 409 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 410 for (j=0; j<n; j++) { 411 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 412 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 413 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 414 } 415 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 416 } 417 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 418 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 419 } else { 420 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 421 } 422 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 423 PetscFunctionReturn(0); 424 } 425 #else 426 #undef __FUNCT__ 427 #define __FUNCT__ "MatMult_SeqBAIJ_2" 428 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 429 { 430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 431 PetscScalar *z = 0,sum1,sum2,*zarray; 432 const PetscScalar *x,*xb; 433 PetscScalar x1,x2; 434 const MatScalar *v; 435 PetscErrorCode ierr; 436 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 437 PetscBool usecprow=a->compressedrow.use; 438 439 PetscFunctionBegin; 440 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 441 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 442 443 idx = a->j; 444 v = a->a; 445 if (usecprow) { 446 mbs = a->compressedrow.nrows; 447 ii = a->compressedrow.i; 448 ridx = a->compressedrow.rindex; 449 } else { 450 mbs = a->mbs; 451 ii = a->i; 452 z = zarray; 453 } 454 455 for (i=0; i<mbs; i++) { 456 n = ii[1] - ii[0]; ii++; 457 sum1 = 0.0; sum2 = 0.0; 458 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 459 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 460 for (j=0; j<n; j++) { 461 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 462 sum1 += v[0]*x1 + v[2]*x2; 463 sum2 += v[1]*x1 + v[3]*x2; 464 v += 4; 465 } 466 if (usecprow) z = zarray + 2*ridx[i]; 467 z[0] = sum1; z[1] = sum2; 468 if (!usecprow) z += 2; 469 } 470 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 471 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 472 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 473 PetscFunctionReturn(0); 474 } 475 #endif 476 477 #if defined(PETSC_THREADCOMM_ACTIVE) 478 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 479 { 480 PetscErrorCode ierr; 481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 482 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 483 const PetscScalar *x,*xb; 484 const MatScalar *aa; 485 PetscInt *trstarts=A->rmap->trstarts; 486 PetscInt n,start,end,i,j; 487 const PetscInt *aj,*ai; 488 489 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 490 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 491 start = trstarts[thread_id] / 3; 492 end = trstarts[thread_id+1] / 3; 493 ai = a->i; 494 for (i=start; i<end; i++) { 495 n = ai[i+1] - ai[i]; 496 aj = a->j + ai[i]; 497 aa = a->a + ai[i]*9; 498 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 499 for (j=0; j<n; j++) { 500 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 501 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 502 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 503 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 504 } 505 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 506 } 507 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 508 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 509 return 0; 510 } 511 512 #undef __FUNCT__ 513 #define __FUNCT__ "MatMult_SeqBAIJ_3" 514 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 515 { 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 517 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 518 const PetscScalar *x,*xb; 519 const MatScalar *v; 520 PetscErrorCode ierr; 521 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 522 PetscBool usecprow=a->compressedrow.use; 523 524 525 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 526 #pragma disjoint(*v,*z,*xb) 527 #endif 528 529 PetscFunctionBegin; 530 531 if (usecprow) { 532 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 533 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 534 mbs = a->compressedrow.nrows; 535 ii = a->compressedrow.i; 536 ridx = a->compressedrow.rindex; 537 for (i=0; i<mbs; i++) { 538 n = ii[i+1] - ii[i]; 539 idx = a->j + ii[i]; 540 v = a->a + ii[i]*9; 541 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 542 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 543 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 544 for (j=0; j<n; j++) { 545 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 546 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 547 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 548 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 549 } 550 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 551 } 552 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 553 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 554 } else { 555 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 556 } 557 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 558 PetscFunctionReturn(0); 559 } 560 #else 561 #undef __FUNCT__ 562 #define __FUNCT__ "MatMult_SeqBAIJ_3" 563 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 564 { 565 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 566 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 567 const PetscScalar *x,*xb; 568 const MatScalar *v; 569 PetscErrorCode ierr; 570 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 571 PetscBool usecprow=a->compressedrow.use; 572 573 574 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 575 #pragma disjoint(*v,*z,*xb) 576 #endif 577 578 PetscFunctionBegin; 579 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 580 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 581 582 idx = a->j; 583 v = a->a; 584 if (usecprow) { 585 mbs = a->compressedrow.nrows; 586 ii = a->compressedrow.i; 587 ridx = a->compressedrow.rindex; 588 } else { 589 mbs = a->mbs; 590 ii = a->i; 591 z = zarray; 592 } 593 594 for (i=0; i<mbs; i++) { 595 n = ii[1] - ii[0]; ii++; 596 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 597 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 598 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 599 for (j=0; j<n; j++) { 600 xb = x + 3*(*idx++); 601 x1 = xb[0]; 602 x2 = xb[1]; 603 x3 = xb[2]; 604 605 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 606 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 607 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 608 v += 9; 609 } 610 if (usecprow) z = zarray + 3*ridx[i]; 611 z[0] = sum1; z[1] = sum2; z[2] = sum3; 612 if (!usecprow) z += 3; 613 } 614 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 615 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 616 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 617 PetscFunctionReturn(0); 618 } 619 #endif 620 621 #if defined(PETSC_THREADCOMM_ACTIVE) 622 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 623 { 624 PetscErrorCode ierr; 625 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 626 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 627 const PetscScalar *x,*xb; 628 const MatScalar *aa; 629 PetscInt *trstarts=A->rmap->trstarts; 630 PetscInt n,start,end,i,j; 631 const PetscInt *aj,*ai; 632 633 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 634 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 635 start = trstarts[thread_id] / 4; 636 end = trstarts[thread_id+1] / 4; 637 ai = a->i; 638 for (i=start; i<end; i++) { 639 n = ai[i+1] - ai[i]; 640 aj = a->j + ai[i]; 641 aa = a->a + ai[i]*16; 642 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 643 for (j=0; j<n; j++) { 644 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 645 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 646 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 647 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 648 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 649 } 650 z[4*i] = sum1; z[4*i+1] = sum2; 651 z[4*i+2] = sum3; z[4*i+3] = sum4; 652 } 653 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 654 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 655 return 0; 656 } 657 658 #undef __FUNCT__ 659 #define __FUNCT__ "MatMult_SeqBAIJ_4" 660 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 661 { 662 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 663 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 664 const PetscScalar *x,*xb; 665 const MatScalar *v; 666 PetscErrorCode ierr; 667 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 668 PetscBool usecprow=a->compressedrow.use; 669 670 PetscFunctionBegin; 671 672 if (usecprow) { 673 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 674 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 675 mbs = a->compressedrow.nrows; 676 ii = a->compressedrow.i; 677 ridx = a->compressedrow.rindex; 678 for (i=0; i<mbs; i++) { 679 n = ii[i+1] - ii[1]; 680 idx = a->j + ii[i]; 681 v = a->a + ii[i]*16; 682 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 683 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 684 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 685 for (j=0; j<n; j++) { 686 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 687 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 688 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 689 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 690 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 691 } 692 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 693 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 694 } 695 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 696 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 697 } else { 698 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 699 } 700 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 701 PetscFunctionReturn(0); 702 } 703 #else 704 #undef __FUNCT__ 705 #define __FUNCT__ "MatMult_SeqBAIJ_4" 706 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 707 { 708 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 709 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 710 const PetscScalar *x,*xb; 711 const MatScalar *v; 712 PetscErrorCode ierr; 713 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 714 PetscBool usecprow=a->compressedrow.use; 715 716 PetscFunctionBegin; 717 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 718 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 719 720 idx = a->j; 721 v = a->a; 722 if (usecprow) { 723 mbs = a->compressedrow.nrows; 724 ii = a->compressedrow.i; 725 ridx = a->compressedrow.rindex; 726 } else { 727 mbs = a->mbs; 728 ii = a->i; 729 z = zarray; 730 } 731 732 for (i=0; i<mbs; i++) { 733 n = ii[1] - ii[0]; 734 ii++; 735 sum1 = 0.0; 736 sum2 = 0.0; 737 sum3 = 0.0; 738 sum4 = 0.0; 739 740 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 741 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 742 for (j=0; j<n; j++) { 743 xb = x + 4*(*idx++); 744 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 745 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 746 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 747 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 748 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 749 v += 16; 750 } 751 if (usecprow) z = zarray + 4*ridx[i]; 752 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 753 if (!usecprow) z += 4; 754 } 755 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 756 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 757 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 758 PetscFunctionReturn(0); 759 } 760 #endif 761 762 #if defined(PETSC_THREADCOMM_ACTIVE) 763 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 764 { 765 PetscErrorCode ierr; 766 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 767 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 768 const PetscScalar *x,*xb; 769 const MatScalar *aa; 770 PetscInt *trstarts=A->rmap->trstarts; 771 PetscInt n,start,end,i,j; 772 const PetscInt *aj,*ai; 773 774 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 775 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 776 start = trstarts[thread_id] / 5; 777 end = trstarts[thread_id+1] / 5; 778 ai = a->i; 779 for (i=start; i<end; i++) { 780 n = ai[i+1] - ai[i]; 781 aj = a->j + ai[i]; 782 aa = a->a + ai[i]*25; 783 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 784 for (j=0; j<n; j++) { 785 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 786 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 787 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 788 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 789 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 790 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 791 } 792 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 793 z[5*i+3] = sum4; z[5*i+4] = sum5; 794 } 795 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 796 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 797 return 0; 798 } 799 800 #undef __FUNCT__ 801 #define __FUNCT__ "MatMult_SeqBAIJ_5" 802 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 803 { 804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 805 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 806 const PetscScalar *xb,*x; 807 const MatScalar *v; 808 PetscErrorCode ierr; 809 const PetscInt *idx,*ii,*ridx=NULL; 810 PetscInt mbs,i,j,n; 811 PetscBool usecprow=a->compressedrow.use; 812 813 PetscFunctionBegin; 814 815 if (usecprow) { 816 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 817 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 818 mbs = a->compressedrow.nrows; 819 ii = a->compressedrow.i; 820 ridx = a->compressedrow.rindex; 821 for (i=0; i<mbs; i++) { 822 n = ii[i+1] - ii[i]; 823 idx = a->j + ii[i]; 824 v = a->a + ii[i]*25; 825 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 826 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 827 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 828 for (j=0; j<n; j++) { 829 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 830 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 831 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 832 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 833 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 834 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 835 } 836 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 837 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 838 } 839 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 840 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 841 } else { 842 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 843 } 844 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 845 PetscFunctionReturn(0); 846 } 847 #else 848 #undef __FUNCT__ 849 #define __FUNCT__ "MatMult_SeqBAIJ_5" 850 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 851 { 852 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 853 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 854 const PetscScalar *xb,*x; 855 const MatScalar *v; 856 PetscErrorCode ierr; 857 const PetscInt *idx,*ii,*ridx=NULL; 858 PetscInt mbs,i,j,n; 859 PetscBool usecprow=a->compressedrow.use; 860 861 PetscFunctionBegin; 862 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 863 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 864 865 idx = a->j; 866 v = a->a; 867 if (usecprow) { 868 mbs = a->compressedrow.nrows; 869 ii = a->compressedrow.i; 870 ridx = a->compressedrow.rindex; 871 } else { 872 mbs = a->mbs; 873 ii = a->i; 874 z = zarray; 875 } 876 877 for (i=0; i<mbs; i++) { 878 n = ii[1] - ii[0]; ii++; 879 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 880 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 881 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 882 for (j=0; j<n; j++) { 883 xb = x + 5*(*idx++); 884 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 885 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 886 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 887 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 888 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 889 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 890 v += 25; 891 } 892 if (usecprow) z = zarray + 5*ridx[i]; 893 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 894 if (!usecprow) z += 5; 895 } 896 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 897 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 898 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 #endif 902 903 904 #if defined(PETSC_THREADCOMM_ACTIVE) 905 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 906 { 907 PetscErrorCode ierr; 908 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 909 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 910 const PetscScalar *x,*xb; 911 const MatScalar *aa; 912 PetscInt *trstarts=A->rmap->trstarts; 913 PetscInt n,start,end,i,j; 914 const PetscInt *aj,*ai; 915 916 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 917 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 918 start = trstarts[thread_id] / 6; 919 end = trstarts[thread_id+1] / 6; 920 ai = a->i; 921 for (i=start; i<end; i++) { 922 n = ai[i+1] - ai[i]; 923 aj = a->j + ai[i]; 924 aa = a->a + ai[i]*36; 925 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 926 for (j=0; j<n; j++) { 927 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 928 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 929 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 930 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 931 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 932 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 933 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 934 } 935 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 936 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 937 } 938 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 939 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 940 return 0; 941 } 942 943 #undef __FUNCT__ 944 #define __FUNCT__ "MatMult_SeqBAIJ_6" 945 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 946 { 947 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 948 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 949 const PetscScalar *x,*xb; 950 const MatScalar *v; 951 PetscErrorCode ierr; 952 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 953 PetscBool usecprow=a->compressedrow.use; 954 955 PetscFunctionBegin; 956 957 if (usecprow) { 958 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 959 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 960 mbs = a->compressedrow.nrows; 961 ii = a->compressedrow.i; 962 ridx = a->compressedrow.rindex; 963 for (i=0; i<mbs; i++) { 964 n = ii[i+1] - ii[i]; 965 idx = a->j + ii[i]; 966 v = a->a + ii[i]*36; 967 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 968 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 969 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 970 for (j=0; j<n; j++) { 971 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 972 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 973 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 974 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 975 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 976 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 977 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 978 } 979 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 980 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 981 } 982 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 983 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 984 } else { 985 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 986 } 987 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 988 PetscFunctionReturn(0); 989 } 990 #else 991 #undef __FUNCT__ 992 #define __FUNCT__ "MatMult_SeqBAIJ_6" 993 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 994 { 995 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 996 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 997 const PetscScalar *x,*xb; 998 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 999 const MatScalar *v; 1000 PetscErrorCode ierr; 1001 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1002 PetscBool usecprow=a->compressedrow.use; 1003 1004 PetscFunctionBegin; 1005 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1006 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1007 1008 idx = a->j; 1009 v = a->a; 1010 if (usecprow) { 1011 mbs = a->compressedrow.nrows; 1012 ii = a->compressedrow.i; 1013 ridx = a->compressedrow.rindex; 1014 } else { 1015 mbs = a->mbs; 1016 ii = a->i; 1017 z = zarray; 1018 } 1019 1020 for (i=0; i<mbs; i++) { 1021 n = ii[1] - ii[0]; 1022 ii++; 1023 sum1 = 0.0; 1024 sum2 = 0.0; 1025 sum3 = 0.0; 1026 sum4 = 0.0; 1027 sum5 = 0.0; 1028 sum6 = 0.0; 1029 1030 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1031 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1032 for (j=0; j<n; j++) { 1033 xb = x + 6*(*idx++); 1034 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1035 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1036 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1037 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1038 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1039 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1040 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1041 v += 36; 1042 } 1043 if (usecprow) z = zarray + 6*ridx[i]; 1044 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1045 if (!usecprow) z += 6; 1046 } 1047 1048 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1049 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1050 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1051 PetscFunctionReturn(0); 1052 } 1053 #endif 1054 1055 #if defined(PETSC_THREADCOMM_ACTIVE) 1056 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1057 { 1058 PetscErrorCode ierr; 1059 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1060 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1061 const PetscScalar *x,*xb; 1062 const MatScalar *aa; 1063 PetscInt *trstarts=A->rmap->trstarts; 1064 PetscInt n,start,end,i,j; 1065 const PetscInt *aj,*ai; 1066 1067 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1068 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1069 start = trstarts[thread_id] / 7; 1070 end = trstarts[thread_id+1] / 7; 1071 ai = a->i; 1072 for (i=start; i<end; i++) { 1073 n = ai[i+1] - ai[i]; 1074 aj = a->j + ai[i]; 1075 aa = a->a + ai[i]*49; 1076 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1077 for (j=0; j<n; j++) { 1078 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1079 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1080 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1081 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1082 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1083 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1084 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1085 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1086 } 1087 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1088 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1089 } 1090 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1091 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1092 return 0; 1093 } 1094 1095 #undef __FUNCT__ 1096 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1097 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1098 { 1099 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1100 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1101 const PetscScalar *x,*xb; 1102 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1103 const MatScalar *v; 1104 PetscErrorCode ierr; 1105 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1106 PetscBool usecprow=a->compressedrow.use; 1107 1108 PetscFunctionBegin; 1109 1110 if (usecprow) { 1111 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1112 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1113 mbs = a->compressedrow.nrows; 1114 ii = a->compressedrow.i; 1115 ridx = a->compressedrow.rindex; 1116 for (i=0; i<mbs; i++) { 1117 n = ii[i+1] - ii[i]; 1118 idx = a->j + ii[i]; 1119 v = a->a + ii[i]*49; 1120 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1121 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1122 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1123 for (j=0; j<n; j++) { 1124 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1125 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1126 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1127 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1128 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1129 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1130 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1131 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1132 } 1133 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1134 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1135 } 1136 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1137 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1138 } else { 1139 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1140 } 1141 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1142 PetscFunctionReturn(0); 1143 } 1144 #else 1145 #undef __FUNCT__ 1146 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1147 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1148 { 1149 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1150 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1151 const PetscScalar *x,*xb; 1152 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1153 const MatScalar *v; 1154 PetscErrorCode ierr; 1155 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1156 PetscBool usecprow=a->compressedrow.use; 1157 1158 PetscFunctionBegin; 1159 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1160 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1161 1162 idx = a->j; 1163 v = a->a; 1164 if (usecprow) { 1165 mbs = a->compressedrow.nrows; 1166 ii = a->compressedrow.i; 1167 ridx = a->compressedrow.rindex; 1168 } else { 1169 mbs = a->mbs; 1170 ii = a->i; 1171 z = zarray; 1172 } 1173 1174 for (i=0; i<mbs; i++) { 1175 n = ii[1] - ii[0]; 1176 ii++; 1177 sum1 = 0.0; 1178 sum2 = 0.0; 1179 sum3 = 0.0; 1180 sum4 = 0.0; 1181 sum5 = 0.0; 1182 sum6 = 0.0; 1183 sum7 = 0.0; 1184 1185 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1186 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1187 for (j=0; j<n; j++) { 1188 xb = x + 7*(*idx++); 1189 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1190 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1191 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1192 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1193 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1194 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1195 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1196 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1197 v += 49; 1198 } 1199 if (usecprow) z = zarray + 7*ridx[i]; 1200 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1201 if (!usecprow) z += 7; 1202 } 1203 1204 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1205 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1206 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1207 PetscFunctionReturn(0); 1208 } 1209 #endif 1210 1211 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1212 /* Default MatMult for block size 15 */ 1213 1214 #undef __FUNCT__ 1215 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1216 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1217 { 1218 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1219 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1220 const PetscScalar *x,*xb; 1221 PetscScalar *zarray,xv; 1222 const MatScalar *v; 1223 PetscErrorCode ierr; 1224 const PetscInt *ii,*ij=a->j,*idx; 1225 PetscInt mbs,i,j,k,n,*ridx=NULL; 1226 PetscBool usecprow=a->compressedrow.use; 1227 1228 PetscFunctionBegin; 1229 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1230 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1231 1232 v = a->a; 1233 if (usecprow) { 1234 mbs = a->compressedrow.nrows; 1235 ii = a->compressedrow.i; 1236 ridx = a->compressedrow.rindex; 1237 } else { 1238 mbs = a->mbs; 1239 ii = a->i; 1240 z = zarray; 1241 } 1242 1243 for (i=0; i<mbs; i++) { 1244 n = ii[i+1] - ii[i]; 1245 idx = ij + ii[i]; 1246 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1247 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1248 1249 for (j=0; j<n; j++) { 1250 xb = x + 15*(idx[j]); 1251 1252 for (k=0; k<15; k++) { 1253 xv = xb[k]; 1254 sum1 += v[0]*xv; 1255 sum2 += v[1]*xv; 1256 sum3 += v[2]*xv; 1257 sum4 += v[3]*xv; 1258 sum5 += v[4]*xv; 1259 sum6 += v[5]*xv; 1260 sum7 += v[6]*xv; 1261 sum8 += v[7]*xv; 1262 sum9 += v[8]*xv; 1263 sum10 += v[9]*xv; 1264 sum11 += v[10]*xv; 1265 sum12 += v[11]*xv; 1266 sum13 += v[12]*xv; 1267 sum14 += v[13]*xv; 1268 sum15 += v[14]*xv; 1269 v += 15; 1270 } 1271 } 1272 if (usecprow) z = zarray + 15*ridx[i]; 1273 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1274 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1275 1276 if (!usecprow) z += 15; 1277 } 1278 1279 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1280 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1281 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1282 PetscFunctionReturn(0); 1283 } 1284 1285 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1286 #undef __FUNCT__ 1287 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1288 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1289 { 1290 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1291 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1292 const PetscScalar *x,*xb; 1293 PetscScalar x1,x2,x3,x4,*zarray; 1294 const MatScalar *v; 1295 PetscErrorCode ierr; 1296 const PetscInt *ii,*ij=a->j,*idx; 1297 PetscInt mbs,i,j,n,*ridx=NULL; 1298 PetscBool usecprow=a->compressedrow.use; 1299 1300 PetscFunctionBegin; 1301 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1302 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1303 1304 v = a->a; 1305 if (usecprow) { 1306 mbs = a->compressedrow.nrows; 1307 ii = a->compressedrow.i; 1308 ridx = a->compressedrow.rindex; 1309 } else { 1310 mbs = a->mbs; 1311 ii = a->i; 1312 z = zarray; 1313 } 1314 1315 for (i=0; i<mbs; i++) { 1316 n = ii[i+1] - ii[i]; 1317 idx = ij + ii[i]; 1318 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1319 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1320 1321 for (j=0; j<n; j++) { 1322 xb = x + 15*(idx[j]); 1323 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1324 1325 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1326 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1327 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1328 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1329 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1330 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1331 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1332 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1333 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1334 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1335 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1336 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1337 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1338 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1339 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1340 1341 v += 60; 1342 1343 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1344 1345 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1346 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1347 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1348 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1349 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1350 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1351 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1352 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1353 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1354 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1355 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1356 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1357 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1358 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1359 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1360 v += 60; 1361 1362 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1363 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1364 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1365 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1366 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1367 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1368 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1369 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1370 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1371 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1372 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1373 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1374 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1375 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1376 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1377 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1378 v += 60; 1379 1380 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1381 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1382 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1383 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1384 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1385 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1386 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1387 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1388 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1389 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1390 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1391 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1392 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1393 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1394 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1395 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1396 v += 45; 1397 } 1398 if (usecprow) z = zarray + 15*ridx[i]; 1399 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1400 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1401 1402 if (!usecprow) z += 15; 1403 } 1404 1405 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1406 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1407 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1408 PetscFunctionReturn(0); 1409 } 1410 1411 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1412 #undef __FUNCT__ 1413 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1414 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1415 { 1416 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1417 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1418 const PetscScalar *x,*xb; 1419 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1420 const MatScalar *v; 1421 PetscErrorCode ierr; 1422 const PetscInt *ii,*ij=a->j,*idx; 1423 PetscInt mbs,i,j,n,*ridx=NULL; 1424 PetscBool usecprow=a->compressedrow.use; 1425 1426 PetscFunctionBegin; 1427 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1428 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1429 1430 v = a->a; 1431 if (usecprow) { 1432 mbs = a->compressedrow.nrows; 1433 ii = a->compressedrow.i; 1434 ridx = a->compressedrow.rindex; 1435 } else { 1436 mbs = a->mbs; 1437 ii = a->i; 1438 z = zarray; 1439 } 1440 1441 for (i=0; i<mbs; i++) { 1442 n = ii[i+1] - ii[i]; 1443 idx = ij + ii[i]; 1444 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1445 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1446 1447 for (j=0; j<n; j++) { 1448 xb = x + 15*(idx[j]); 1449 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1450 x8 = xb[7]; 1451 1452 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1453 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1454 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1455 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1456 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1457 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1458 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1459 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1460 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1461 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1462 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1463 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1464 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1465 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1466 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1467 v += 120; 1468 1469 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1470 1471 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1472 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1473 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1474 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1475 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1476 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1477 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1478 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1479 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1480 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1481 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1482 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1483 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1484 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1485 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1486 v += 105; 1487 } 1488 if (usecprow) z = zarray + 15*ridx[i]; 1489 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1490 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1491 1492 if (!usecprow) z += 15; 1493 } 1494 1495 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1496 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1497 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1498 PetscFunctionReturn(0); 1499 } 1500 1501 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1502 1503 #undef __FUNCT__ 1504 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1505 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1506 { 1507 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1508 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1509 const PetscScalar *x,*xb; 1510 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1511 const MatScalar *v; 1512 PetscErrorCode ierr; 1513 const PetscInt *ii,*ij=a->j,*idx; 1514 PetscInt mbs,i,j,n,*ridx=NULL; 1515 PetscBool usecprow=a->compressedrow.use; 1516 1517 PetscFunctionBegin; 1518 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1519 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1520 1521 v = a->a; 1522 if (usecprow) { 1523 mbs = a->compressedrow.nrows; 1524 ii = a->compressedrow.i; 1525 ridx = a->compressedrow.rindex; 1526 } else { 1527 mbs = a->mbs; 1528 ii = a->i; 1529 z = zarray; 1530 } 1531 1532 for (i=0; i<mbs; i++) { 1533 n = ii[i+1] - ii[i]; 1534 idx = ij + ii[i]; 1535 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1536 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1537 1538 for (j=0; j<n; j++) { 1539 xb = x + 15*(idx[j]); 1540 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1541 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1542 1543 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1544 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1545 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1546 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1547 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1548 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1549 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1550 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1551 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1552 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1553 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1554 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1555 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1556 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1557 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1558 v += 225; 1559 } 1560 if (usecprow) z = zarray + 15*ridx[i]; 1561 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1562 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1563 1564 if (!usecprow) z += 15; 1565 } 1566 1567 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1568 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1569 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1570 PetscFunctionReturn(0); 1571 } 1572 1573 1574 /* 1575 This will not work with MatScalar == float because it calls the BLAS 1576 */ 1577 #undef __FUNCT__ 1578 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1579 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1580 { 1581 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1582 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1583 MatScalar *v; 1584 PetscErrorCode ierr; 1585 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1586 PetscInt ncols,k,*ridx=NULL; 1587 PetscBool usecprow=a->compressedrow.use; 1588 1589 PetscFunctionBegin; 1590 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1591 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1592 1593 idx = a->j; 1594 v = a->a; 1595 if (usecprow) { 1596 mbs = a->compressedrow.nrows; 1597 ii = a->compressedrow.i; 1598 ridx = a->compressedrow.rindex; 1599 } else { 1600 mbs = a->mbs; 1601 ii = a->i; 1602 z = zarray; 1603 } 1604 1605 if (!a->mult_work) { 1606 k = PetscMax(A->rmap->n,A->cmap->n); 1607 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1608 } 1609 work = a->mult_work; 1610 for (i=0; i<mbs; i++) { 1611 n = ii[1] - ii[0]; ii++; 1612 ncols = n*bs; 1613 workt = work; 1614 for (j=0; j<n; j++) { 1615 xb = x + bs*(*idx++); 1616 for (k=0; k<bs; k++) workt[k] = xb[k]; 1617 workt += bs; 1618 } 1619 if (usecprow) z = zarray + bs*ridx[i]; 1620 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1621 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1622 v += n*bs2; 1623 if (!usecprow) z += bs; 1624 } 1625 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1626 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1627 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1628 PetscFunctionReturn(0); 1629 } 1630 1631 #undef __FUNCT__ 1632 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1633 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1634 { 1635 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1636 const PetscScalar *x; 1637 PetscScalar *y,*z,sum; 1638 const MatScalar *v; 1639 PetscErrorCode ierr; 1640 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1641 const PetscInt *idx,*ii; 1642 PetscBool usecprow=a->compressedrow.use; 1643 1644 PetscFunctionBegin; 1645 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1646 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1647 if (zz != yy) { 1648 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1649 } else { 1650 z = y; 1651 } 1652 1653 idx = a->j; 1654 v = a->a; 1655 if (usecprow) { 1656 if (zz != yy) { 1657 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1658 } 1659 mbs = a->compressedrow.nrows; 1660 ii = a->compressedrow.i; 1661 ridx = a->compressedrow.rindex; 1662 } else { 1663 ii = a->i; 1664 } 1665 1666 for (i=0; i<mbs; i++) { 1667 n = ii[1] - ii[0]; 1668 ii++; 1669 if (!usecprow) { 1670 sum = y[i]; 1671 } else { 1672 sum = y[ridx[i]]; 1673 } 1674 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1675 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1676 PetscSparseDensePlusDot(sum,x,v,idx,n); 1677 v += n; 1678 idx += n; 1679 if (usecprow) { 1680 z[ridx[i]] = sum; 1681 } else { 1682 z[i] = sum; 1683 } 1684 } 1685 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1686 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1687 if (zz != yy) { 1688 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1689 } 1690 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1691 PetscFunctionReturn(0); 1692 } 1693 1694 #undef __FUNCT__ 1695 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1696 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1697 { 1698 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1699 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1700 PetscScalar x1,x2,*yarray,*zarray; 1701 MatScalar *v; 1702 PetscErrorCode ierr; 1703 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1704 PetscBool usecprow=a->compressedrow.use; 1705 1706 PetscFunctionBegin; 1707 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1708 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1709 if (zz != yy) { 1710 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1711 } else { 1712 zarray = yarray; 1713 } 1714 1715 idx = a->j; 1716 v = a->a; 1717 if (usecprow) { 1718 if (zz != yy) { 1719 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1720 } 1721 mbs = a->compressedrow.nrows; 1722 ii = a->compressedrow.i; 1723 ridx = a->compressedrow.rindex; 1724 if (zz != yy) { 1725 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1726 } 1727 } else { 1728 ii = a->i; 1729 y = yarray; 1730 z = zarray; 1731 } 1732 1733 for (i=0; i<mbs; i++) { 1734 n = ii[1] - ii[0]; ii++; 1735 if (usecprow) { 1736 z = zarray + 2*ridx[i]; 1737 y = yarray + 2*ridx[i]; 1738 } 1739 sum1 = y[0]; sum2 = y[1]; 1740 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1741 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1742 for (j=0; j<n; j++) { 1743 xb = x + 2*(*idx++); 1744 x1 = xb[0]; 1745 x2 = xb[1]; 1746 1747 sum1 += v[0]*x1 + v[2]*x2; 1748 sum2 += v[1]*x1 + v[3]*x2; 1749 v += 4; 1750 } 1751 z[0] = sum1; z[1] = sum2; 1752 if (!usecprow) { 1753 z += 2; y += 2; 1754 } 1755 } 1756 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1757 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1758 if (zz != yy) { 1759 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1760 } 1761 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 #undef __FUNCT__ 1766 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1767 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1768 { 1769 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1770 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1771 MatScalar *v; 1772 PetscErrorCode ierr; 1773 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1774 PetscBool usecprow=a->compressedrow.use; 1775 1776 PetscFunctionBegin; 1777 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1778 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1779 if (zz != yy) { 1780 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1781 } else { 1782 zarray = yarray; 1783 } 1784 1785 idx = a->j; 1786 v = a->a; 1787 if (usecprow) { 1788 if (zz != yy) { 1789 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1790 } 1791 mbs = a->compressedrow.nrows; 1792 ii = a->compressedrow.i; 1793 ridx = a->compressedrow.rindex; 1794 } else { 1795 ii = a->i; 1796 y = yarray; 1797 z = zarray; 1798 } 1799 1800 for (i=0; i<mbs; i++) { 1801 n = ii[1] - ii[0]; ii++; 1802 if (usecprow) { 1803 z = zarray + 3*ridx[i]; 1804 y = yarray + 3*ridx[i]; 1805 } 1806 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1807 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1808 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1809 for (j=0; j<n; j++) { 1810 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1811 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1812 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1813 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1814 v += 9; 1815 } 1816 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1817 if (!usecprow) { 1818 z += 3; y += 3; 1819 } 1820 } 1821 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1822 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1823 if (zz != yy) { 1824 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1825 } 1826 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1827 PetscFunctionReturn(0); 1828 } 1829 1830 #undef __FUNCT__ 1831 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1832 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1833 { 1834 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1835 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1836 MatScalar *v; 1837 PetscErrorCode ierr; 1838 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1839 PetscBool usecprow=a->compressedrow.use; 1840 1841 PetscFunctionBegin; 1842 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1843 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1844 if (zz != yy) { 1845 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1846 } else { 1847 zarray = yarray; 1848 } 1849 1850 idx = a->j; 1851 v = a->a; 1852 if (usecprow) { 1853 if (zz != yy) { 1854 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1855 } 1856 mbs = a->compressedrow.nrows; 1857 ii = a->compressedrow.i; 1858 ridx = a->compressedrow.rindex; 1859 } else { 1860 ii = a->i; 1861 y = yarray; 1862 z = zarray; 1863 } 1864 1865 for (i=0; i<mbs; i++) { 1866 n = ii[1] - ii[0]; ii++; 1867 if (usecprow) { 1868 z = zarray + 4*ridx[i]; 1869 y = yarray + 4*ridx[i]; 1870 } 1871 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1872 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1873 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1874 for (j=0; j<n; j++) { 1875 xb = x + 4*(*idx++); 1876 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1877 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1878 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1879 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1880 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1881 v += 16; 1882 } 1883 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1884 if (!usecprow) { 1885 z += 4; y += 4; 1886 } 1887 } 1888 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1889 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1890 if (zz != yy) { 1891 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1892 } 1893 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1894 PetscFunctionReturn(0); 1895 } 1896 1897 #undef __FUNCT__ 1898 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1899 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1900 { 1901 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1902 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1903 PetscScalar *yarray,*zarray; 1904 MatScalar *v; 1905 PetscErrorCode ierr; 1906 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1907 PetscBool usecprow=a->compressedrow.use; 1908 1909 PetscFunctionBegin; 1910 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1911 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1912 if (zz != yy) { 1913 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1914 } else { 1915 zarray = yarray; 1916 } 1917 1918 idx = a->j; 1919 v = a->a; 1920 if (usecprow) { 1921 if (zz != yy) { 1922 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1923 } 1924 mbs = a->compressedrow.nrows; 1925 ii = a->compressedrow.i; 1926 ridx = a->compressedrow.rindex; 1927 } else { 1928 ii = a->i; 1929 y = yarray; 1930 z = zarray; 1931 } 1932 1933 for (i=0; i<mbs; i++) { 1934 n = ii[1] - ii[0]; ii++; 1935 if (usecprow) { 1936 z = zarray + 5*ridx[i]; 1937 y = yarray + 5*ridx[i]; 1938 } 1939 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1940 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1941 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1942 for (j=0; j<n; j++) { 1943 xb = x + 5*(*idx++); 1944 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1945 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1946 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1947 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1948 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1949 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1950 v += 25; 1951 } 1952 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1953 if (!usecprow) { 1954 z += 5; y += 5; 1955 } 1956 } 1957 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1958 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1959 if (zz != yy) { 1960 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1961 } 1962 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1963 PetscFunctionReturn(0); 1964 } 1965 #undef __FUNCT__ 1966 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1967 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1968 { 1969 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1970 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1971 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1972 MatScalar *v; 1973 PetscErrorCode ierr; 1974 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1975 PetscBool usecprow=a->compressedrow.use; 1976 1977 PetscFunctionBegin; 1978 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1979 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1980 if (zz != yy) { 1981 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1982 } else { 1983 zarray = yarray; 1984 } 1985 1986 idx = a->j; 1987 v = a->a; 1988 if (usecprow) { 1989 if (zz != yy) { 1990 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1991 } 1992 mbs = a->compressedrow.nrows; 1993 ii = a->compressedrow.i; 1994 ridx = a->compressedrow.rindex; 1995 } else { 1996 ii = a->i; 1997 y = yarray; 1998 z = zarray; 1999 } 2000 2001 for (i=0; i<mbs; i++) { 2002 n = ii[1] - ii[0]; ii++; 2003 if (usecprow) { 2004 z = zarray + 6*ridx[i]; 2005 y = yarray + 6*ridx[i]; 2006 } 2007 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2008 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2009 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2010 for (j=0; j<n; j++) { 2011 xb = x + 6*(*idx++); 2012 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2013 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2014 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2015 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2016 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2017 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2018 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2019 v += 36; 2020 } 2021 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2022 if (!usecprow) { 2023 z += 6; y += 6; 2024 } 2025 } 2026 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2027 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 2028 if (zz != yy) { 2029 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2030 } 2031 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2032 PetscFunctionReturn(0); 2033 } 2034 2035 #undef __FUNCT__ 2036 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2037 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2038 { 2039 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2040 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2041 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2042 MatScalar *v; 2043 PetscErrorCode ierr; 2044 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 2045 PetscBool usecprow=a->compressedrow.use; 2046 2047 PetscFunctionBegin; 2048 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2049 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 2050 if (zz != yy) { 2051 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2052 } else { 2053 zarray = yarray; 2054 } 2055 2056 idx = a->j; 2057 v = a->a; 2058 if (usecprow) { 2059 if (zz != yy) { 2060 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2061 } 2062 mbs = a->compressedrow.nrows; 2063 ii = a->compressedrow.i; 2064 ridx = a->compressedrow.rindex; 2065 } else { 2066 ii = a->i; 2067 y = yarray; 2068 z = zarray; 2069 } 2070 2071 for (i=0; i<mbs; i++) { 2072 n = ii[1] - ii[0]; ii++; 2073 if (usecprow) { 2074 z = zarray + 7*ridx[i]; 2075 y = yarray + 7*ridx[i]; 2076 } 2077 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2078 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2079 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2080 for (j=0; j<n; j++) { 2081 xb = x + 7*(*idx++); 2082 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2083 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2084 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2085 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2086 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2087 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2088 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2089 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2090 v += 49; 2091 } 2092 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2093 if (!usecprow) { 2094 z += 7; y += 7; 2095 } 2096 } 2097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2098 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 2099 if (zz != yy) { 2100 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2101 } 2102 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2103 PetscFunctionReturn(0); 2104 } 2105 2106 #undef __FUNCT__ 2107 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2108 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2109 { 2110 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2111 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 2112 MatScalar *v; 2113 PetscErrorCode ierr; 2114 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 2115 PetscInt ncols,k,*ridx=NULL; 2116 PetscBool usecprow=a->compressedrow.use; 2117 2118 PetscFunctionBegin; 2119 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2120 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2121 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2122 2123 idx = a->j; 2124 v = a->a; 2125 if (usecprow) { 2126 mbs = a->compressedrow.nrows; 2127 ii = a->compressedrow.i; 2128 ridx = a->compressedrow.rindex; 2129 } else { 2130 mbs = a->mbs; 2131 ii = a->i; 2132 z = zarray; 2133 } 2134 2135 if (!a->mult_work) { 2136 k = PetscMax(A->rmap->n,A->cmap->n); 2137 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2138 } 2139 work = a->mult_work; 2140 for (i=0; i<mbs; i++) { 2141 n = ii[1] - ii[0]; ii++; 2142 ncols = n*bs; 2143 workt = work; 2144 for (j=0; j<n; j++) { 2145 xb = x + bs*(*idx++); 2146 for (k=0; k<bs; k++) workt[k] = xb[k]; 2147 workt += bs; 2148 } 2149 if (usecprow) z = zarray + bs*ridx[i]; 2150 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2151 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2152 v += n*bs2; 2153 if (!usecprow) z += bs; 2154 } 2155 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2156 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2157 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2158 PetscFunctionReturn(0); 2159 } 2160 2161 #undef __FUNCT__ 2162 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2163 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2164 { 2165 PetscScalar zero = 0.0; 2166 PetscErrorCode ierr; 2167 2168 PetscFunctionBegin; 2169 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2170 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2171 PetscFunctionReturn(0); 2172 } 2173 2174 #undef __FUNCT__ 2175 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2176 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2177 { 2178 PetscScalar zero = 0.0; 2179 PetscErrorCode ierr; 2180 2181 PetscFunctionBegin; 2182 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2183 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2184 PetscFunctionReturn(0); 2185 } 2186 2187 #undef __FUNCT__ 2188 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2189 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2190 { 2191 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2192 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 2193 MatScalar *v; 2194 PetscErrorCode ierr; 2195 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 2196 Mat_CompressedRow cprow = a->compressedrow; 2197 PetscBool usecprow=cprow.use; 2198 2199 PetscFunctionBegin; 2200 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2201 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2202 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2203 2204 idx = a->j; 2205 v = a->a; 2206 if (usecprow) { 2207 mbs = cprow.nrows; 2208 ii = cprow.i; 2209 ridx = cprow.rindex; 2210 } else { 2211 mbs=a->mbs; 2212 ii = a->i; 2213 xb = x; 2214 } 2215 2216 switch (bs) { 2217 case 1: 2218 for (i=0; i<mbs; i++) { 2219 if (usecprow) xb = x + ridx[i]; 2220 x1 = xb[0]; 2221 ib = idx + ii[0]; 2222 n = ii[1] - ii[0]; ii++; 2223 for (j=0; j<n; j++) { 2224 rval = ib[j]; 2225 z[rval] += PetscConj(*v) * x1; 2226 v++; 2227 } 2228 if (!usecprow) xb++; 2229 } 2230 break; 2231 case 2: 2232 for (i=0; i<mbs; i++) { 2233 if (usecprow) xb = x + 2*ridx[i]; 2234 x1 = xb[0]; x2 = xb[1]; 2235 ib = idx + ii[0]; 2236 n = ii[1] - ii[0]; ii++; 2237 for (j=0; j<n; j++) { 2238 rval = ib[j]*2; 2239 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2240 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2241 v += 4; 2242 } 2243 if (!usecprow) xb += 2; 2244 } 2245 break; 2246 case 3: 2247 for (i=0; i<mbs; i++) { 2248 if (usecprow) xb = x + 3*ridx[i]; 2249 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2250 ib = idx + ii[0]; 2251 n = ii[1] - ii[0]; ii++; 2252 for (j=0; j<n; j++) { 2253 rval = ib[j]*3; 2254 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2255 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2256 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2257 v += 9; 2258 } 2259 if (!usecprow) xb += 3; 2260 } 2261 break; 2262 case 4: 2263 for (i=0; i<mbs; i++) { 2264 if (usecprow) xb = x + 4*ridx[i]; 2265 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2266 ib = idx + ii[0]; 2267 n = ii[1] - ii[0]; ii++; 2268 for (j=0; j<n; j++) { 2269 rval = ib[j]*4; 2270 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2271 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2272 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2273 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2274 v += 16; 2275 } 2276 if (!usecprow) xb += 4; 2277 } 2278 break; 2279 case 5: 2280 for (i=0; i<mbs; i++) { 2281 if (usecprow) xb = x + 5*ridx[i]; 2282 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2283 x4 = xb[3]; x5 = xb[4]; 2284 ib = idx + ii[0]; 2285 n = ii[1] - ii[0]; ii++; 2286 for (j=0; j<n; j++) { 2287 rval = ib[j]*5; 2288 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2289 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2290 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2291 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2292 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2293 v += 25; 2294 } 2295 if (!usecprow) xb += 5; 2296 } 2297 break; 2298 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 2299 PetscInt ncols,k; 2300 PetscScalar *work,*workt,*xtmp; 2301 2302 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2303 if (!a->mult_work) { 2304 k = PetscMax(A->rmap->n,A->cmap->n); 2305 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2306 } 2307 work = a->mult_work; 2308 xtmp = x; 2309 for (i=0; i<mbs; i++) { 2310 n = ii[1] - ii[0]; ii++; 2311 ncols = n*bs; 2312 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2313 if (usecprow) xtmp = x + bs*ridx[i]; 2314 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2315 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2316 v += n*bs2; 2317 if (!usecprow) xtmp += bs; 2318 workt = work; 2319 for (j=0; j<n; j++) { 2320 zb = z + bs*(*idx++); 2321 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2322 workt += bs; 2323 } 2324 } 2325 } 2326 } 2327 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2328 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2329 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2330 PetscFunctionReturn(0); 2331 } 2332 2333 #undef __FUNCT__ 2334 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2335 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2336 { 2337 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2338 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 2339 MatScalar *v; 2340 PetscErrorCode ierr; 2341 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 2342 Mat_CompressedRow cprow = a->compressedrow; 2343 PetscBool usecprow=cprow.use; 2344 2345 PetscFunctionBegin; 2346 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2347 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2348 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2349 2350 idx = a->j; 2351 v = a->a; 2352 if (usecprow) { 2353 mbs = cprow.nrows; 2354 ii = cprow.i; 2355 ridx = cprow.rindex; 2356 } else { 2357 mbs=a->mbs; 2358 ii = a->i; 2359 xb = x; 2360 } 2361 2362 switch (bs) { 2363 case 1: 2364 for (i=0; i<mbs; i++) { 2365 if (usecprow) xb = x + ridx[i]; 2366 x1 = xb[0]; 2367 ib = idx + ii[0]; 2368 n = ii[1] - ii[0]; ii++; 2369 for (j=0; j<n; j++) { 2370 rval = ib[j]; 2371 z[rval] += *v * x1; 2372 v++; 2373 } 2374 if (!usecprow) xb++; 2375 } 2376 break; 2377 case 2: 2378 for (i=0; i<mbs; i++) { 2379 if (usecprow) xb = x + 2*ridx[i]; 2380 x1 = xb[0]; x2 = xb[1]; 2381 ib = idx + ii[0]; 2382 n = ii[1] - ii[0]; ii++; 2383 for (j=0; j<n; j++) { 2384 rval = ib[j]*2; 2385 z[rval++] += v[0]*x1 + v[1]*x2; 2386 z[rval++] += v[2]*x1 + v[3]*x2; 2387 v += 4; 2388 } 2389 if (!usecprow) xb += 2; 2390 } 2391 break; 2392 case 3: 2393 for (i=0; i<mbs; i++) { 2394 if (usecprow) xb = x + 3*ridx[i]; 2395 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2396 ib = idx + ii[0]; 2397 n = ii[1] - ii[0]; ii++; 2398 for (j=0; j<n; j++) { 2399 rval = ib[j]*3; 2400 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2401 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2402 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2403 v += 9; 2404 } 2405 if (!usecprow) xb += 3; 2406 } 2407 break; 2408 case 4: 2409 for (i=0; i<mbs; i++) { 2410 if (usecprow) xb = x + 4*ridx[i]; 2411 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2412 ib = idx + ii[0]; 2413 n = ii[1] - ii[0]; ii++; 2414 for (j=0; j<n; j++) { 2415 rval = ib[j]*4; 2416 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2417 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2418 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2419 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2420 v += 16; 2421 } 2422 if (!usecprow) xb += 4; 2423 } 2424 break; 2425 case 5: 2426 for (i=0; i<mbs; i++) { 2427 if (usecprow) xb = x + 5*ridx[i]; 2428 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2429 x4 = xb[3]; x5 = xb[4]; 2430 ib = idx + ii[0]; 2431 n = ii[1] - ii[0]; ii++; 2432 for (j=0; j<n; j++) { 2433 rval = ib[j]*5; 2434 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2435 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2436 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2437 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2438 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2439 v += 25; 2440 } 2441 if (!usecprow) xb += 5; 2442 } 2443 break; 2444 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2445 PetscInt ncols,k; 2446 PetscScalar *work,*workt,*xtmp; 2447 2448 if (!a->mult_work) { 2449 k = PetscMax(A->rmap->n,A->cmap->n); 2450 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2451 } 2452 work = a->mult_work; 2453 xtmp = x; 2454 for (i=0; i<mbs; i++) { 2455 n = ii[1] - ii[0]; ii++; 2456 ncols = n*bs; 2457 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2458 if (usecprow) xtmp = x + bs*ridx[i]; 2459 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2460 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2461 v += n*bs2; 2462 if (!usecprow) xtmp += bs; 2463 workt = work; 2464 for (j=0; j<n; j++) { 2465 zb = z + bs*(*idx++); 2466 for (k=0; k<bs; k++) zb[k] += workt[k]; 2467 workt += bs; 2468 } 2469 } 2470 } 2471 } 2472 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2473 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2474 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2475 PetscFunctionReturn(0); 2476 } 2477 2478 #undef __FUNCT__ 2479 #define __FUNCT__ "MatScale_SeqBAIJ" 2480 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2481 { 2482 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2483 PetscInt totalnz = a->bs2*a->nz; 2484 PetscScalar oalpha = alpha; 2485 PetscErrorCode ierr; 2486 PetscBLASInt one = 1,tnz; 2487 2488 PetscFunctionBegin; 2489 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2490 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2491 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2492 PetscFunctionReturn(0); 2493 } 2494 2495 #undef __FUNCT__ 2496 #define __FUNCT__ "MatNorm_SeqBAIJ" 2497 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2498 { 2499 PetscErrorCode ierr; 2500 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2501 MatScalar *v = a->a; 2502 PetscReal sum = 0.0; 2503 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2504 2505 PetscFunctionBegin; 2506 if (type == NORM_FROBENIUS) { 2507 for (i=0; i< bs2*nz; i++) { 2508 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2509 } 2510 *norm = PetscSqrtReal(sum); 2511 } else if (type == NORM_1) { /* maximum column sum */ 2512 PetscReal *tmp; 2513 PetscInt *bcol = a->j; 2514 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2515 for (i=0; i<nz; i++) { 2516 for (j=0; j<bs; j++) { 2517 k1 = bs*(*bcol) + j; /* column index */ 2518 for (k=0; k<bs; k++) { 2519 tmp[k1] += PetscAbsScalar(*v); v++; 2520 } 2521 } 2522 bcol++; 2523 } 2524 *norm = 0.0; 2525 for (j=0; j<A->cmap->n; j++) { 2526 if (tmp[j] > *norm) *norm = tmp[j]; 2527 } 2528 ierr = PetscFree(tmp);CHKERRQ(ierr); 2529 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2530 *norm = 0.0; 2531 for (k=0; k<bs; k++) { 2532 for (j=0; j<a->mbs; j++) { 2533 v = a->a + bs2*a->i[j] + k; 2534 sum = 0.0; 2535 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2536 for (k1=0; k1<bs; k1++) { 2537 sum += PetscAbsScalar(*v); 2538 v += bs; 2539 } 2540 } 2541 if (sum > *norm) *norm = sum; 2542 } 2543 } 2544 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2545 PetscFunctionReturn(0); 2546 } 2547 2548 2549 #undef __FUNCT__ 2550 #define __FUNCT__ "MatEqual_SeqBAIJ" 2551 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2552 { 2553 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2554 PetscErrorCode ierr; 2555 2556 PetscFunctionBegin; 2557 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2558 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2559 *flg = PETSC_FALSE; 2560 PetscFunctionReturn(0); 2561 } 2562 2563 /* if the a->i are the same */ 2564 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2565 if (!*flg) PetscFunctionReturn(0); 2566 2567 /* if a->j are the same */ 2568 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2569 if (!*flg) PetscFunctionReturn(0); 2570 2571 /* if a->a are the same */ 2572 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2573 PetscFunctionReturn(0); 2574 2575 } 2576 2577 #undef __FUNCT__ 2578 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2579 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2580 { 2581 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2582 PetscErrorCode ierr; 2583 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2584 PetscScalar *x,zero = 0.0; 2585 MatScalar *aa,*aa_j; 2586 2587 PetscFunctionBegin; 2588 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2589 bs = A->rmap->bs; 2590 aa = a->a; 2591 ai = a->i; 2592 aj = a->j; 2593 ambs = a->mbs; 2594 bs2 = a->bs2; 2595 2596 ierr = VecSet(v,zero);CHKERRQ(ierr); 2597 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2598 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2599 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2600 for (i=0; i<ambs; i++) { 2601 for (j=ai[i]; j<ai[i+1]; j++) { 2602 if (aj[j] == i) { 2603 row = i*bs; 2604 aa_j = aa+j*bs2; 2605 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2606 break; 2607 } 2608 } 2609 } 2610 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2611 PetscFunctionReturn(0); 2612 } 2613 2614 #undef __FUNCT__ 2615 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2616 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2617 { 2618 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2619 const PetscScalar *l,*r,*li,*ri; 2620 PetscScalar x; 2621 MatScalar *aa, *v; 2622 PetscErrorCode ierr; 2623 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2624 const PetscInt *ai,*aj; 2625 2626 PetscFunctionBegin; 2627 ai = a->i; 2628 aj = a->j; 2629 aa = a->a; 2630 m = A->rmap->n; 2631 n = A->cmap->n; 2632 bs = A->rmap->bs; 2633 mbs = a->mbs; 2634 bs2 = a->bs2; 2635 if (ll) { 2636 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2637 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2638 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2639 for (i=0; i<mbs; i++) { /* for each block row */ 2640 M = ai[i+1] - ai[i]; 2641 li = l + i*bs; 2642 v = aa + bs2*ai[i]; 2643 for (j=0; j<M; j++) { /* for each block */ 2644 for (k=0; k<bs2; k++) { 2645 (*v++) *= li[k%bs]; 2646 } 2647 } 2648 } 2649 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2650 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2651 } 2652 2653 if (rr) { 2654 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2655 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2656 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2657 for (i=0; i<mbs; i++) { /* for each block row */ 2658 iai = ai[i]; 2659 M = ai[i+1] - iai; 2660 v = aa + bs2*iai; 2661 for (j=0; j<M; j++) { /* for each block */ 2662 ri = r + bs*aj[iai+j]; 2663 for (k=0; k<bs; k++) { 2664 x = ri[k]; 2665 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2666 v += bs; 2667 } 2668 } 2669 } 2670 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2671 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2672 } 2673 PetscFunctionReturn(0); 2674 } 2675 2676 2677 #undef __FUNCT__ 2678 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2679 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2680 { 2681 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2682 2683 PetscFunctionBegin; 2684 info->block_size = a->bs2; 2685 info->nz_allocated = a->bs2*a->maxnz; 2686 info->nz_used = a->bs2*a->nz; 2687 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2688 info->assemblies = A->num_ass; 2689 info->mallocs = A->info.mallocs; 2690 info->memory = ((PetscObject)A)->mem; 2691 if (A->factortype) { 2692 info->fill_ratio_given = A->info.fill_ratio_given; 2693 info->fill_ratio_needed = A->info.fill_ratio_needed; 2694 info->factor_mallocs = A->info.factor_mallocs; 2695 } else { 2696 info->fill_ratio_given = 0; 2697 info->fill_ratio_needed = 0; 2698 info->factor_mallocs = 0; 2699 } 2700 PetscFunctionReturn(0); 2701 } 2702 2703 2704 #if defined(PETSC_THREADCOMM_ACTIVE) 2705 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2706 { 2707 PetscErrorCode ierr; 2708 PetscInt *trstarts=A->rmap->trstarts; 2709 PetscInt n,start,end; 2710 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2711 2712 start = trstarts[thread_id]/A->rmap->bs; 2713 end = trstarts[thread_id+1]/A->rmap->bs; 2714 n = a->i[end] - a->i[start]; 2715 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2716 return 0; 2717 } 2718 2719 #undef __FUNCT__ 2720 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2721 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2722 { 2723 PetscErrorCode ierr; 2724 2725 2726 PetscFunctionBegin; 2727 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2728 PetscFunctionReturn(0); 2729 } 2730 #else 2731 #undef __FUNCT__ 2732 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2733 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2734 { 2735 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2736 PetscErrorCode ierr; 2737 2738 PetscFunctionBegin; 2739 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2740 PetscFunctionReturn(0); 2741 } 2742 #endif 2743