1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag; 90 91 PetscFunctionBegin; 92 93 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 94 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 95 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 96 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 97 98 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr); 99 ssmap = smap; 100 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr); 101 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 102 /* determine lens of each row */ 103 for (i=0; i<nrows; i++) { 104 kstart = ai[irow[i]]; 105 kend = kstart + a->ilen[irow[i]]; 106 lens[i] = 0; 107 for (k=kstart; k<kend; k++) { 108 if (ssmap[aj[k]]) lens[i]++; 109 } 110 } 111 /* Create and fill new matrix */ 112 if (scall == MAT_REUSE_MATRIX) { 113 c = (Mat_SeqBAIJ*)((*B)->data); 114 115 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 116 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 117 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 118 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 119 C = *B; 120 } else { 121 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 122 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 123 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 124 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 125 } 126 c = (Mat_SeqBAIJ*)(C->data); 127 for (i=0; i<nrows; i++) { 128 row = irow[i]; 129 kstart = ai[row]; 130 kend = kstart + a->ilen[row]; 131 mat_i = c->i[i]; 132 mat_j = c->j + mat_i; 133 mat_a = c->a + mat_i*bs2; 134 mat_ilen = c->ilen + i; 135 for (k=kstart; k<kend; k++) { 136 if ((tcol=ssmap[a->j[k]])) { 137 *mat_j++ = tcol - 1; 138 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 139 mat_a += bs2; 140 (*mat_ilen)++; 141 } 142 } 143 } 144 /* sort */ 145 { 146 MatScalar *work; 147 148 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 149 for (i=0; i<nrows; i++) { 150 PetscInt ilen; 151 mat_i = c->i[i]; 152 mat_j = c->j + mat_i; 153 mat_a = c->a + mat_i*bs2; 154 ilen = c->ilen[i]; 155 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 156 } 157 ierr = PetscFree(work);CHKERRQ(ierr); 158 } 159 160 /* Free work space */ 161 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 162 ierr = PetscFree(smap);CHKERRQ(ierr); 163 ierr = PetscFree(lens);CHKERRQ(ierr); 164 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 165 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 166 167 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 168 *B = C; 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNCT__ 173 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 174 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 175 { 176 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 177 IS is1,is2; 178 PetscErrorCode ierr; 179 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 180 const PetscInt *irow,*icol; 181 182 PetscFunctionBegin; 183 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 184 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 185 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 186 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 187 188 /* Verify if the indices corespond to each element in a block 189 and form the IS with compressed IS */ 190 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 191 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 192 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 193 for (i=0; i<a->mbs; i++) { 194 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 195 } 196 count = 0; 197 for (i=0; i<nrows; i++) { 198 PetscInt j = irow[i] / bs; 199 if ((vary[j]--)==bs) iary[count++] = j; 200 } 201 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 202 203 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 204 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 205 for (i=0; i<a->mbs; i++) { 206 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 207 } 208 count = 0; 209 for (i=0; i<ncols; i++) { 210 PetscInt j = icol[i] / bs; 211 if ((vary[j]--)==bs) iary[count++] = j; 212 } 213 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 214 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 215 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 216 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 217 218 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 219 ierr = ISDestroy(&is1);CHKERRQ(ierr); 220 ierr = ISDestroy(&is2);CHKERRQ(ierr); 221 PetscFunctionReturn(0); 222 } 223 224 #undef __FUNCT__ 225 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 226 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 227 { 228 PetscErrorCode ierr; 229 PetscInt i; 230 231 PetscFunctionBegin; 232 if (scall == MAT_INITIAL_MATRIX) { 233 ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr); 234 } 235 236 for (i=0; i<n; i++) { 237 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 238 } 239 PetscFunctionReturn(0); 240 } 241 242 243 /* -------------------------------------------------------*/ 244 /* Should check that shapes of vectors and matrices match */ 245 /* -------------------------------------------------------*/ 246 247 #if defined(PETSC_THREADCOMM_ACTIVE) 248 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 249 { 250 PetscErrorCode ierr; 251 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 252 PetscScalar *z; 253 const PetscScalar *x; 254 const MatScalar *aa; 255 PetscInt *trstarts=A->rmap->trstarts; 256 PetscInt n,start,end,i; 257 const PetscInt *aj,*ai; 258 PetscScalar sum; 259 260 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 261 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 262 start = trstarts[thread_id]; 263 end = trstarts[thread_id+1]; 264 ai = a->i; 265 for (i=start; i<end; i++) { 266 n = ai[i+1] - ai[i]; 267 aj = a->j + ai[i]; 268 aa = a->a + ai[i]; 269 sum = 0.0; 270 PetscSparseDensePlusDot(sum,x,aa,aj,n); 271 z[i] = sum; 272 } 273 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 274 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 275 return 0; 276 } 277 278 #undef __FUNCT__ 279 #define __FUNCT__ "MatMult_SeqBAIJ_1" 280 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 281 { 282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 283 PetscScalar *z,sum; 284 const PetscScalar *x; 285 const MatScalar *v; 286 PetscErrorCode ierr; 287 PetscInt mbs,i,n; 288 const PetscInt *idx,*ii,*ridx=NULL; 289 PetscBool usecprow=a->compressedrow.use; 290 291 PetscFunctionBegin; 292 293 if (usecprow) { 294 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 295 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 296 mbs = a->compressedrow.nrows; 297 ii = a->compressedrow.i; 298 ridx = a->compressedrow.rindex; 299 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 300 for (i=0; i<mbs; i++) { 301 n = ii[i+1] - ii[i]; 302 v = a->a + ii[i]; 303 idx = a->j + ii[i]; 304 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 305 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 306 sum = 0.0; 307 PetscSparseDensePlusDot(sum,x,v,idx,n); 308 z[ridx[i]] = sum; 309 } 310 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 311 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 312 } else { 313 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 314 } 315 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 316 PetscFunctionReturn(0); 317 } 318 #else 319 #undef __FUNCT__ 320 #define __FUNCT__ "MatMult_SeqBAIJ_1" 321 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 322 { 323 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 324 PetscScalar *z,sum; 325 const PetscScalar *x; 326 const MatScalar *v; 327 PetscErrorCode ierr; 328 PetscInt mbs,i,n; 329 const PetscInt *idx,*ii,*ridx=NULL; 330 PetscBool usecprow=a->compressedrow.use; 331 332 PetscFunctionBegin; 333 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 334 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 335 336 if (usecprow) { 337 mbs = a->compressedrow.nrows; 338 ii = a->compressedrow.i; 339 ridx = a->compressedrow.rindex; 340 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 341 } else { 342 mbs = a->mbs; 343 ii = a->i; 344 } 345 346 for (i=0; i<mbs; i++) { 347 n = ii[1] - ii[0]; 348 v = a->a + ii[0]; 349 idx = a->j + ii[0]; 350 ii++; 351 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 352 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 353 sum = 0.0; 354 PetscSparseDensePlusDot(sum,x,v,idx,n); 355 if (usecprow) { 356 z[ridx[i]] = sum; 357 } else { 358 z[i] = sum; 359 } 360 } 361 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 362 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 363 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 #endif 367 368 #if defined(PETSC_THREADCOMM_ACTIVE) 369 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 370 { 371 PetscErrorCode ierr; 372 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 373 PetscScalar *z,x1,x2,sum1,sum2; 374 const PetscScalar *x,*xb; 375 const MatScalar *aa; 376 PetscInt *trstarts=A->rmap->trstarts; 377 PetscInt n,start,end,i,j; 378 const PetscInt *aj,*ai; 379 380 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 381 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 382 start = trstarts[thread_id] / 2; 383 end = trstarts[thread_id+1] / 2; 384 ai = a->i; 385 for (i=start; i<end; i++) { 386 n = ai[i+1] - ai[i]; 387 aj = a->j + ai[i]; 388 aa = a->a + ai[i]*4; 389 sum1 = 0.0; sum2 = 0.0; 390 for (j=0; j<n; j++) { 391 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 392 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 393 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 394 } 395 z[2*i] = sum1; z[2*i+1] = sum2; 396 } 397 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 398 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 399 return 0; 400 } 401 402 #undef __FUNCT__ 403 #define __FUNCT__ "MatMult_SeqBAIJ_2" 404 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 405 { 406 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 407 PetscScalar *z,x1,x2,sum1,sum2; 408 const PetscScalar *x,*xb; 409 const MatScalar *v; 410 PetscErrorCode ierr; 411 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 412 PetscBool usecprow=a->compressedrow.use; 413 414 PetscFunctionBegin; 415 416 if (usecprow) { 417 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 418 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 419 mbs = a->compressedrow.nrows; 420 ii = a->compressedrow.i; 421 ridx = a->compressedrow.rindex; 422 for (i=0; i<mbs; i++) { 423 n = ii[i+1] - ii[i]; 424 idx = a->j + ii[i]; 425 v = a->a + ii[i]*4; 426 sum1 = 0.0; sum2 = 0.0; 427 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 428 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 429 for (j=0; j<n; j++) { 430 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 431 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 432 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 433 } 434 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 435 } 436 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 437 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 438 } else { 439 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 440 } 441 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 442 PetscFunctionReturn(0); 443 } 444 #else 445 #undef __FUNCT__ 446 #define __FUNCT__ "MatMult_SeqBAIJ_2" 447 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 448 { 449 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 450 PetscScalar *z = 0,sum1,sum2,*zarray; 451 const PetscScalar *x,*xb; 452 PetscScalar x1,x2; 453 const MatScalar *v; 454 PetscErrorCode ierr; 455 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 456 PetscBool usecprow=a->compressedrow.use; 457 458 PetscFunctionBegin; 459 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 460 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 461 462 idx = a->j; 463 v = a->a; 464 if (usecprow) { 465 mbs = a->compressedrow.nrows; 466 ii = a->compressedrow.i; 467 ridx = a->compressedrow.rindex; 468 } else { 469 mbs = a->mbs; 470 ii = a->i; 471 z = zarray; 472 } 473 474 for (i=0; i<mbs; i++) { 475 n = ii[1] - ii[0]; ii++; 476 sum1 = 0.0; sum2 = 0.0; 477 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 478 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 479 for (j=0; j<n; j++) { 480 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 481 sum1 += v[0]*x1 + v[2]*x2; 482 sum2 += v[1]*x1 + v[3]*x2; 483 v += 4; 484 } 485 if (usecprow) z = zarray + 2*ridx[i]; 486 z[0] = sum1; z[1] = sum2; 487 if (!usecprow) z += 2; 488 } 489 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 490 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 491 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 492 PetscFunctionReturn(0); 493 } 494 #endif 495 496 #if defined(PETSC_THREADCOMM_ACTIVE) 497 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 498 { 499 PetscErrorCode ierr; 500 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 501 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 502 const PetscScalar *x,*xb; 503 const MatScalar *aa; 504 PetscInt *trstarts=A->rmap->trstarts; 505 PetscInt n,start,end,i,j; 506 const PetscInt *aj,*ai; 507 508 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 509 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 510 start = trstarts[thread_id] / 3; 511 end = trstarts[thread_id+1] / 3; 512 ai = a->i; 513 for (i=start; i<end; i++) { 514 n = ai[i+1] - ai[i]; 515 aj = a->j + ai[i]; 516 aa = a->a + ai[i]*9; 517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 518 for (j=0; j<n; j++) { 519 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 520 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 521 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 522 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 523 } 524 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 525 } 526 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 527 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 528 return 0; 529 } 530 531 #undef __FUNCT__ 532 #define __FUNCT__ "MatMult_SeqBAIJ_3" 533 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 534 { 535 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 536 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 537 const PetscScalar *x,*xb; 538 const MatScalar *v; 539 PetscErrorCode ierr; 540 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 541 PetscBool usecprow=a->compressedrow.use; 542 543 544 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 545 #pragma disjoint(*v,*z,*xb) 546 #endif 547 548 PetscFunctionBegin; 549 550 if (usecprow) { 551 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 552 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 553 mbs = a->compressedrow.nrows; 554 ii = a->compressedrow.i; 555 ridx = a->compressedrow.rindex; 556 for (i=0; i<mbs; i++) { 557 n = ii[i+1] - ii[i]; 558 idx = a->j + ii[i]; 559 v = a->a + ii[i]*9; 560 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 561 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 562 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 563 for (j=0; j<n; j++) { 564 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 565 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 566 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 567 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 568 } 569 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 570 } 571 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 572 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 573 } else { 574 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 575 } 576 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 577 PetscFunctionReturn(0); 578 } 579 #else 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMult_SeqBAIJ_3" 582 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 583 { 584 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 585 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 586 const PetscScalar *x,*xb; 587 const MatScalar *v; 588 PetscErrorCode ierr; 589 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 590 PetscBool usecprow=a->compressedrow.use; 591 592 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 593 #pragma disjoint(*v,*z,*xb) 594 #endif 595 596 PetscFunctionBegin; 597 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 598 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 599 600 idx = a->j; 601 v = a->a; 602 if (usecprow) { 603 mbs = a->compressedrow.nrows; 604 ii = a->compressedrow.i; 605 ridx = a->compressedrow.rindex; 606 } else { 607 mbs = a->mbs; 608 ii = a->i; 609 z = zarray; 610 } 611 612 for (i=0; i<mbs; i++) { 613 n = ii[1] - ii[0]; ii++; 614 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 615 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 616 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 617 for (j=0; j<n; j++) { 618 xb = x + 3*(*idx++); 619 x1 = xb[0]; 620 x2 = xb[1]; 621 x3 = xb[2]; 622 623 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 624 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 625 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 626 v += 9; 627 } 628 if (usecprow) z = zarray + 3*ridx[i]; 629 z[0] = sum1; z[1] = sum2; z[2] = sum3; 630 if (!usecprow) z += 3; 631 } 632 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 633 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 634 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 635 PetscFunctionReturn(0); 636 } 637 #endif 638 639 #if defined(PETSC_THREADCOMM_ACTIVE) 640 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 641 { 642 PetscErrorCode ierr; 643 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 644 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 645 const PetscScalar *x,*xb; 646 const MatScalar *aa; 647 PetscInt *trstarts=A->rmap->trstarts; 648 PetscInt n,start,end,i,j; 649 const PetscInt *aj,*ai; 650 651 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 652 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 653 start = trstarts[thread_id] / 4; 654 end = trstarts[thread_id+1] / 4; 655 ai = a->i; 656 for (i=start; i<end; i++) { 657 n = ai[i+1] - ai[i]; 658 aj = a->j + ai[i]; 659 aa = a->a + ai[i]*16; 660 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 661 for (j=0; j<n; j++) { 662 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 663 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 664 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 665 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 666 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 667 } 668 z[4*i] = sum1; z[4*i+1] = sum2; 669 z[4*i+2] = sum3; z[4*i+3] = sum4; 670 } 671 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 672 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 673 return 0; 674 } 675 676 #undef __FUNCT__ 677 #define __FUNCT__ "MatMult_SeqBAIJ_4" 678 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 679 { 680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 681 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 682 const PetscScalar *x,*xb; 683 const MatScalar *v; 684 PetscErrorCode ierr; 685 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 686 PetscBool usecprow=a->compressedrow.use; 687 688 PetscFunctionBegin; 689 690 if (usecprow) { 691 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 692 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 693 mbs = a->compressedrow.nrows; 694 ii = a->compressedrow.i; 695 ridx = a->compressedrow.rindex; 696 for (i=0; i<mbs; i++) { 697 n = ii[i+1] - ii[1]; 698 idx = a->j + ii[i]; 699 v = a->a + ii[i]*16; 700 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 701 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 702 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 703 for (j=0; j<n; j++) { 704 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 705 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 706 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 707 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 708 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 709 } 710 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 711 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 712 } 713 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 714 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 715 } else { 716 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 717 } 718 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 719 PetscFunctionReturn(0); 720 } 721 #else 722 #undef __FUNCT__ 723 #define __FUNCT__ "MatMult_SeqBAIJ_4" 724 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 725 { 726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 727 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 728 const PetscScalar *x,*xb; 729 const MatScalar *v; 730 PetscErrorCode ierr; 731 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 732 PetscBool usecprow=a->compressedrow.use; 733 734 PetscFunctionBegin; 735 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 736 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 737 738 idx = a->j; 739 v = a->a; 740 if (usecprow) { 741 mbs = a->compressedrow.nrows; 742 ii = a->compressedrow.i; 743 ridx = a->compressedrow.rindex; 744 } else { 745 mbs = a->mbs; 746 ii = a->i; 747 z = zarray; 748 } 749 750 for (i=0; i<mbs; i++) { 751 n = ii[1] - ii[0]; 752 ii++; 753 sum1 = 0.0; 754 sum2 = 0.0; 755 sum3 = 0.0; 756 sum4 = 0.0; 757 758 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 759 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 760 for (j=0; j<n; j++) { 761 xb = x + 4*(*idx++); 762 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 763 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 764 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 765 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 766 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 767 v += 16; 768 } 769 if (usecprow) z = zarray + 4*ridx[i]; 770 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 771 if (!usecprow) z += 4; 772 } 773 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 774 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 775 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 776 PetscFunctionReturn(0); 777 } 778 #endif 779 780 #if defined(PETSC_THREADCOMM_ACTIVE) 781 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 782 { 783 PetscErrorCode ierr; 784 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 785 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 786 const PetscScalar *x,*xb; 787 const MatScalar *aa; 788 PetscInt *trstarts=A->rmap->trstarts; 789 PetscInt n,start,end,i,j; 790 const PetscInt *aj,*ai; 791 792 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 793 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 794 start = trstarts[thread_id] / 5; 795 end = trstarts[thread_id+1] / 5; 796 ai = a->i; 797 for (i=start; i<end; i++) { 798 n = ai[i+1] - ai[i]; 799 aj = a->j + ai[i]; 800 aa = a->a + ai[i]*25; 801 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 802 for (j=0; j<n; j++) { 803 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 804 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 805 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 806 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 807 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 808 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 809 } 810 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 811 z[5*i+3] = sum4; z[5*i+4] = sum5; 812 } 813 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 814 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 815 return 0; 816 } 817 818 #undef __FUNCT__ 819 #define __FUNCT__ "MatMult_SeqBAIJ_5" 820 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 821 { 822 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 823 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 824 const PetscScalar *xb,*x; 825 const MatScalar *v; 826 PetscErrorCode ierr; 827 const PetscInt *idx,*ii,*ridx=NULL; 828 PetscInt mbs,i,j,n; 829 PetscBool usecprow=a->compressedrow.use; 830 831 PetscFunctionBegin; 832 833 if (usecprow) { 834 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 835 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 836 mbs = a->compressedrow.nrows; 837 ii = a->compressedrow.i; 838 ridx = a->compressedrow.rindex; 839 for (i=0; i<mbs; i++) { 840 n = ii[i+1] - ii[i]; 841 idx = a->j + ii[i]; 842 v = a->a + ii[i]*25; 843 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 844 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 845 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 846 for (j=0; j<n; j++) { 847 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 848 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 849 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 850 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 851 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 852 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 853 } 854 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 855 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 856 } 857 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 858 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 859 } else { 860 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 861 } 862 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 863 PetscFunctionReturn(0); 864 } 865 #else 866 #undef __FUNCT__ 867 #define __FUNCT__ "MatMult_SeqBAIJ_5" 868 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 869 { 870 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 871 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 872 const PetscScalar *xb,*x; 873 const MatScalar *v; 874 PetscErrorCode ierr; 875 const PetscInt *idx,*ii,*ridx=NULL; 876 PetscInt mbs,i,j,n; 877 PetscBool usecprow=a->compressedrow.use; 878 879 PetscFunctionBegin; 880 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 881 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 882 883 idx = a->j; 884 v = a->a; 885 if (usecprow) { 886 mbs = a->compressedrow.nrows; 887 ii = a->compressedrow.i; 888 ridx = a->compressedrow.rindex; 889 } else { 890 mbs = a->mbs; 891 ii = a->i; 892 z = zarray; 893 } 894 895 for (i=0; i<mbs; i++) { 896 n = ii[1] - ii[0]; ii++; 897 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 898 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 899 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 900 for (j=0; j<n; j++) { 901 xb = x + 5*(*idx++); 902 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 903 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 904 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 905 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 906 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 907 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 908 v += 25; 909 } 910 if (usecprow) z = zarray + 5*ridx[i]; 911 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 912 if (!usecprow) z += 5; 913 } 914 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 915 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 916 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 917 PetscFunctionReturn(0); 918 } 919 #endif 920 921 922 #if defined(PETSC_THREADCOMM_ACTIVE) 923 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 924 { 925 PetscErrorCode ierr; 926 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 927 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 928 const PetscScalar *x,*xb; 929 const MatScalar *aa; 930 PetscInt *trstarts=A->rmap->trstarts; 931 PetscInt n,start,end,i,j; 932 const PetscInt *aj,*ai; 933 934 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 935 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 936 start = trstarts[thread_id] / 6; 937 end = trstarts[thread_id+1] / 6; 938 ai = a->i; 939 for (i=start; i<end; i++) { 940 n = ai[i+1] - ai[i]; 941 aj = a->j + ai[i]; 942 aa = a->a + ai[i]*36; 943 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 944 for (j=0; j<n; j++) { 945 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 946 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 947 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 948 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 949 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 950 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 951 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 952 } 953 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 954 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 955 } 956 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 957 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 958 return 0; 959 } 960 961 #undef __FUNCT__ 962 #define __FUNCT__ "MatMult_SeqBAIJ_6" 963 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 964 { 965 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 966 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 967 const PetscScalar *x,*xb; 968 const MatScalar *v; 969 PetscErrorCode ierr; 970 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 971 PetscBool usecprow=a->compressedrow.use; 972 973 PetscFunctionBegin; 974 975 if (usecprow) { 976 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 977 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 978 mbs = a->compressedrow.nrows; 979 ii = a->compressedrow.i; 980 ridx = a->compressedrow.rindex; 981 for (i=0; i<mbs; i++) { 982 n = ii[i+1] - ii[i]; 983 idx = a->j + ii[i]; 984 v = a->a + ii[i]*36; 985 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 986 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 987 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 988 for (j=0; j<n; j++) { 989 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 990 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 991 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 992 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 993 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 994 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 995 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 996 } 997 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 998 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 999 } 1000 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1001 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1002 } else { 1003 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1004 } 1005 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1006 PetscFunctionReturn(0); 1007 } 1008 #else 1009 #undef __FUNCT__ 1010 #define __FUNCT__ "MatMult_SeqBAIJ_6" 1011 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 1012 { 1013 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1014 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1015 const PetscScalar *x,*xb; 1016 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 1017 const MatScalar *v; 1018 PetscErrorCode ierr; 1019 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1020 PetscBool usecprow=a->compressedrow.use; 1021 1022 PetscFunctionBegin; 1023 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1024 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1025 1026 idx = a->j; 1027 v = a->a; 1028 if (usecprow) { 1029 mbs = a->compressedrow.nrows; 1030 ii = a->compressedrow.i; 1031 ridx = a->compressedrow.rindex; 1032 } else { 1033 mbs = a->mbs; 1034 ii = a->i; 1035 z = zarray; 1036 } 1037 1038 for (i=0; i<mbs; i++) { 1039 n = ii[1] - ii[0]; 1040 ii++; 1041 sum1 = 0.0; 1042 sum2 = 0.0; 1043 sum3 = 0.0; 1044 sum4 = 0.0; 1045 sum5 = 0.0; 1046 sum6 = 0.0; 1047 1048 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1049 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1050 for (j=0; j<n; j++) { 1051 xb = x + 6*(*idx++); 1052 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1053 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1054 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1055 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1056 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1057 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1058 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1059 v += 36; 1060 } 1061 if (usecprow) z = zarray + 6*ridx[i]; 1062 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1063 if (!usecprow) z += 6; 1064 } 1065 1066 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1067 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1068 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1069 PetscFunctionReturn(0); 1070 } 1071 #endif 1072 1073 #if defined(PETSC_THREADCOMM_ACTIVE) 1074 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1075 { 1076 PetscErrorCode ierr; 1077 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1078 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1079 const PetscScalar *x,*xb; 1080 const MatScalar *aa; 1081 PetscInt *trstarts=A->rmap->trstarts; 1082 PetscInt n,start,end,i,j; 1083 const PetscInt *aj,*ai; 1084 1085 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1086 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1087 start = trstarts[thread_id] / 7; 1088 end = trstarts[thread_id+1] / 7; 1089 ai = a->i; 1090 for (i=start; i<end; i++) { 1091 n = ai[i+1] - ai[i]; 1092 aj = a->j + ai[i]; 1093 aa = a->a + ai[i]*49; 1094 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1095 for (j=0; j<n; j++) { 1096 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1097 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1098 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1099 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1100 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1101 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1102 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1103 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1104 } 1105 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1106 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1107 } 1108 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1109 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1110 return 0; 1111 } 1112 1113 #undef __FUNCT__ 1114 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1115 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1116 { 1117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1118 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1119 const PetscScalar *x,*xb; 1120 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1121 const MatScalar *v; 1122 PetscErrorCode ierr; 1123 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1124 PetscBool usecprow=a->compressedrow.use; 1125 1126 PetscFunctionBegin; 1127 1128 if (usecprow) { 1129 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1130 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1131 mbs = a->compressedrow.nrows; 1132 ii = a->compressedrow.i; 1133 ridx = a->compressedrow.rindex; 1134 for (i=0; i<mbs; i++) { 1135 n = ii[i+1] - ii[i]; 1136 idx = a->j + ii[i]; 1137 v = a->a + ii[i]*49; 1138 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1139 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1140 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1141 for (j=0; j<n; j++) { 1142 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1143 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1144 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1145 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1146 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1147 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1148 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1149 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1150 } 1151 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1152 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1153 } 1154 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1155 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1156 } else { 1157 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1158 } 1159 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 #else 1163 #undef __FUNCT__ 1164 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1165 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1166 { 1167 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1168 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1169 const PetscScalar *x,*xb; 1170 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1171 const MatScalar *v; 1172 PetscErrorCode ierr; 1173 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1174 PetscBool usecprow=a->compressedrow.use; 1175 1176 PetscFunctionBegin; 1177 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1178 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1179 1180 idx = a->j; 1181 v = a->a; 1182 if (usecprow) { 1183 mbs = a->compressedrow.nrows; 1184 ii = a->compressedrow.i; 1185 ridx = a->compressedrow.rindex; 1186 } else { 1187 mbs = a->mbs; 1188 ii = a->i; 1189 z = zarray; 1190 } 1191 1192 for (i=0; i<mbs; i++) { 1193 n = ii[1] - ii[0]; 1194 ii++; 1195 sum1 = 0.0; 1196 sum2 = 0.0; 1197 sum3 = 0.0; 1198 sum4 = 0.0; 1199 sum5 = 0.0; 1200 sum6 = 0.0; 1201 sum7 = 0.0; 1202 1203 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1204 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1205 for (j=0; j<n; j++) { 1206 xb = x + 7*(*idx++); 1207 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1208 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1209 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1210 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1211 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1212 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1213 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1214 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1215 v += 49; 1216 } 1217 if (usecprow) z = zarray + 7*ridx[i]; 1218 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1219 if (!usecprow) z += 7; 1220 } 1221 1222 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1223 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1224 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1225 PetscFunctionReturn(0); 1226 } 1227 #endif 1228 1229 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1230 /* Default MatMult for block size 15 */ 1231 1232 #undef __FUNCT__ 1233 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1234 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1235 { 1236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1237 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1238 const PetscScalar *x,*xb; 1239 PetscScalar *zarray,xv; 1240 const MatScalar *v; 1241 PetscErrorCode ierr; 1242 const PetscInt *ii,*ij=a->j,*idx; 1243 PetscInt mbs,i,j,k,n,*ridx=NULL; 1244 PetscBool usecprow=a->compressedrow.use; 1245 1246 PetscFunctionBegin; 1247 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1248 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1249 1250 v = a->a; 1251 if (usecprow) { 1252 mbs = a->compressedrow.nrows; 1253 ii = a->compressedrow.i; 1254 ridx = a->compressedrow.rindex; 1255 } else { 1256 mbs = a->mbs; 1257 ii = a->i; 1258 z = zarray; 1259 } 1260 1261 for (i=0; i<mbs; i++) { 1262 n = ii[i+1] - ii[i]; 1263 idx = ij + ii[i]; 1264 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1265 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1266 1267 for (j=0; j<n; j++) { 1268 xb = x + 15*(idx[j]); 1269 1270 for (k=0; k<15; k++) { 1271 xv = xb[k]; 1272 sum1 += v[0]*xv; 1273 sum2 += v[1]*xv; 1274 sum3 += v[2]*xv; 1275 sum4 += v[3]*xv; 1276 sum5 += v[4]*xv; 1277 sum6 += v[5]*xv; 1278 sum7 += v[6]*xv; 1279 sum8 += v[7]*xv; 1280 sum9 += v[8]*xv; 1281 sum10 += v[9]*xv; 1282 sum11 += v[10]*xv; 1283 sum12 += v[11]*xv; 1284 sum13 += v[12]*xv; 1285 sum14 += v[13]*xv; 1286 sum15 += v[14]*xv; 1287 v += 15; 1288 } 1289 } 1290 if (usecprow) z = zarray + 15*ridx[i]; 1291 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1292 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1293 1294 if (!usecprow) z += 15; 1295 } 1296 1297 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1298 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1299 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1300 PetscFunctionReturn(0); 1301 } 1302 1303 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1304 #undef __FUNCT__ 1305 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1306 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1307 { 1308 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1309 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1310 const PetscScalar *x,*xb; 1311 PetscScalar x1,x2,x3,x4,*zarray; 1312 const MatScalar *v; 1313 PetscErrorCode ierr; 1314 const PetscInt *ii,*ij=a->j,*idx; 1315 PetscInt mbs,i,j,n,*ridx=NULL; 1316 PetscBool usecprow=a->compressedrow.use; 1317 1318 PetscFunctionBegin; 1319 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1320 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1321 1322 v = a->a; 1323 if (usecprow) { 1324 mbs = a->compressedrow.nrows; 1325 ii = a->compressedrow.i; 1326 ridx = a->compressedrow.rindex; 1327 } else { 1328 mbs = a->mbs; 1329 ii = a->i; 1330 z = zarray; 1331 } 1332 1333 for (i=0; i<mbs; i++) { 1334 n = ii[i+1] - ii[i]; 1335 idx = ij + ii[i]; 1336 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1337 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1338 1339 for (j=0; j<n; j++) { 1340 xb = x + 15*(idx[j]); 1341 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1342 1343 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1344 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1345 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1346 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1347 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1348 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1349 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1350 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1351 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1352 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1353 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1354 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1355 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1356 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1357 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1358 1359 v += 60; 1360 1361 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1362 1363 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1364 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1365 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1366 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1367 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1368 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1369 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1370 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1371 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1372 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1373 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1374 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1375 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1376 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1377 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1378 v += 60; 1379 1380 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1381 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1382 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1383 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1384 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1385 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1386 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1387 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1388 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1389 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1390 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1391 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1392 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1393 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1394 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1395 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1396 v += 60; 1397 1398 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1399 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1400 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1401 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1402 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1403 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1404 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1405 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1406 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1407 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1408 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1409 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1410 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1411 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1412 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1413 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1414 v += 45; 1415 } 1416 if (usecprow) z = zarray + 15*ridx[i]; 1417 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1418 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1419 1420 if (!usecprow) z += 15; 1421 } 1422 1423 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1424 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1425 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1430 #undef __FUNCT__ 1431 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1432 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1433 { 1434 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1435 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1436 const PetscScalar *x,*xb; 1437 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1438 const MatScalar *v; 1439 PetscErrorCode ierr; 1440 const PetscInt *ii,*ij=a->j,*idx; 1441 PetscInt mbs,i,j,n,*ridx=NULL; 1442 PetscBool usecprow=a->compressedrow.use; 1443 1444 PetscFunctionBegin; 1445 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1446 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1447 1448 v = a->a; 1449 if (usecprow) { 1450 mbs = a->compressedrow.nrows; 1451 ii = a->compressedrow.i; 1452 ridx = a->compressedrow.rindex; 1453 } else { 1454 mbs = a->mbs; 1455 ii = a->i; 1456 z = zarray; 1457 } 1458 1459 for (i=0; i<mbs; i++) { 1460 n = ii[i+1] - ii[i]; 1461 idx = ij + ii[i]; 1462 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1463 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1464 1465 for (j=0; j<n; j++) { 1466 xb = x + 15*(idx[j]); 1467 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1468 x8 = xb[7]; 1469 1470 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1471 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1472 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1473 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1474 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1475 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1476 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1477 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1478 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1479 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1480 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1481 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1482 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1483 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1484 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1485 v += 120; 1486 1487 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1488 1489 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1490 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1491 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1492 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1493 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1494 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1495 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1496 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1497 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1498 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1499 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1500 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1501 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1502 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1503 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1504 v += 105; 1505 } 1506 if (usecprow) z = zarray + 15*ridx[i]; 1507 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1508 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1509 1510 if (!usecprow) z += 15; 1511 } 1512 1513 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1514 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1515 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1516 PetscFunctionReturn(0); 1517 } 1518 1519 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1520 1521 #undef __FUNCT__ 1522 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1523 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1524 { 1525 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1526 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1527 const PetscScalar *x,*xb; 1528 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1529 const MatScalar *v; 1530 PetscErrorCode ierr; 1531 const PetscInt *ii,*ij=a->j,*idx; 1532 PetscInt mbs,i,j,n,*ridx=NULL; 1533 PetscBool usecprow=a->compressedrow.use; 1534 1535 PetscFunctionBegin; 1536 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1537 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1538 1539 v = a->a; 1540 if (usecprow) { 1541 mbs = a->compressedrow.nrows; 1542 ii = a->compressedrow.i; 1543 ridx = a->compressedrow.rindex; 1544 } else { 1545 mbs = a->mbs; 1546 ii = a->i; 1547 z = zarray; 1548 } 1549 1550 for (i=0; i<mbs; i++) { 1551 n = ii[i+1] - ii[i]; 1552 idx = ij + ii[i]; 1553 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1554 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1555 1556 for (j=0; j<n; j++) { 1557 xb = x + 15*(idx[j]); 1558 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1559 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1560 1561 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1562 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1563 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1564 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1565 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1566 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1567 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1568 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1569 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1570 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1571 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1572 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1573 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1574 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1575 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1576 v += 225; 1577 } 1578 if (usecprow) z = zarray + 15*ridx[i]; 1579 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1580 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1581 1582 if (!usecprow) z += 15; 1583 } 1584 1585 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1586 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1587 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1588 PetscFunctionReturn(0); 1589 } 1590 1591 1592 /* 1593 This will not work with MatScalar == float because it calls the BLAS 1594 */ 1595 #undef __FUNCT__ 1596 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1597 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1598 { 1599 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1600 PetscScalar *z = 0,*work,*workt,*zarray; 1601 const PetscScalar *x,*xb; 1602 const MatScalar *v; 1603 PetscErrorCode ierr; 1604 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 1605 const PetscInt *idx,*ii,*ridx=NULL; 1606 PetscInt ncols,k; 1607 PetscBool usecprow=a->compressedrow.use; 1608 1609 PetscFunctionBegin; 1610 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1611 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1612 1613 idx = a->j; 1614 v = a->a; 1615 if (usecprow) { 1616 mbs = a->compressedrow.nrows; 1617 ii = a->compressedrow.i; 1618 ridx = a->compressedrow.rindex; 1619 } else { 1620 mbs = a->mbs; 1621 ii = a->i; 1622 z = zarray; 1623 } 1624 1625 if (!a->mult_work) { 1626 k = PetscMax(A->rmap->n,A->cmap->n); 1627 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 1628 } 1629 work = a->mult_work; 1630 for (i=0; i<mbs; i++) { 1631 n = ii[1] - ii[0]; ii++; 1632 ncols = n*bs; 1633 workt = work; 1634 for (j=0; j<n; j++) { 1635 xb = x + bs*(*idx++); 1636 for (k=0; k<bs; k++) workt[k] = xb[k]; 1637 workt += bs; 1638 } 1639 if (usecprow) z = zarray + bs*ridx[i]; 1640 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1641 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1642 v += n*bs2; 1643 if (!usecprow) z += bs; 1644 } 1645 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1646 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1647 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1648 PetscFunctionReturn(0); 1649 } 1650 1651 #undef __FUNCT__ 1652 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1653 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1654 { 1655 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1656 const PetscScalar *x; 1657 PetscScalar *y,*z,sum; 1658 const MatScalar *v; 1659 PetscErrorCode ierr; 1660 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1661 const PetscInt *idx,*ii; 1662 PetscBool usecprow=a->compressedrow.use; 1663 1664 PetscFunctionBegin; 1665 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1666 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1667 1668 idx = a->j; 1669 v = a->a; 1670 if (usecprow) { 1671 if (zz != yy) { 1672 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1673 } 1674 mbs = a->compressedrow.nrows; 1675 ii = a->compressedrow.i; 1676 ridx = a->compressedrow.rindex; 1677 } else { 1678 ii = a->i; 1679 } 1680 1681 for (i=0; i<mbs; i++) { 1682 n = ii[1] - ii[0]; 1683 ii++; 1684 if (!usecprow) { 1685 sum = y[i]; 1686 } else { 1687 sum = y[ridx[i]]; 1688 } 1689 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1690 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1691 PetscSparseDensePlusDot(sum,x,v,idx,n); 1692 v += n; 1693 idx += n; 1694 if (usecprow) { 1695 z[ridx[i]] = sum; 1696 } else { 1697 z[i] = sum; 1698 } 1699 } 1700 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1701 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 1702 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1703 PetscFunctionReturn(0); 1704 } 1705 1706 #undef __FUNCT__ 1707 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1708 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1709 { 1710 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1711 PetscScalar *y = 0,*z = 0,sum1,sum2; 1712 const PetscScalar *x,*xb; 1713 PetscScalar x1,x2,*yarray,*zarray; 1714 const MatScalar *v; 1715 PetscErrorCode ierr; 1716 PetscInt mbs = a->mbs,i,n,j; 1717 const PetscInt *idx,*ii,*ridx = NULL; 1718 PetscBool usecprow = a->compressedrow.use; 1719 1720 PetscFunctionBegin; 1721 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1722 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1723 1724 idx = a->j; 1725 v = a->a; 1726 if (usecprow) { 1727 if (zz != yy) { 1728 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1729 } 1730 mbs = a->compressedrow.nrows; 1731 ii = a->compressedrow.i; 1732 ridx = a->compressedrow.rindex; 1733 if (zz != yy) { 1734 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1735 } 1736 } else { 1737 ii = a->i; 1738 y = yarray; 1739 z = zarray; 1740 } 1741 1742 for (i=0; i<mbs; i++) { 1743 n = ii[1] - ii[0]; ii++; 1744 if (usecprow) { 1745 z = zarray + 2*ridx[i]; 1746 y = yarray + 2*ridx[i]; 1747 } 1748 sum1 = y[0]; sum2 = y[1]; 1749 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1750 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1751 for (j=0; j<n; j++) { 1752 xb = x + 2*(*idx++); 1753 x1 = xb[0]; 1754 x2 = xb[1]; 1755 1756 sum1 += v[0]*x1 + v[2]*x2; 1757 sum2 += v[1]*x1 + v[3]*x2; 1758 v += 4; 1759 } 1760 z[0] = sum1; z[1] = sum2; 1761 if (!usecprow) { 1762 z += 2; y += 2; 1763 } 1764 } 1765 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1766 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1767 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1768 PetscFunctionReturn(0); 1769 } 1770 1771 #undef __FUNCT__ 1772 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1773 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1774 { 1775 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1776 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1777 const PetscScalar *x,*xb; 1778 const MatScalar *v; 1779 PetscErrorCode ierr; 1780 PetscInt mbs = a->mbs,i,j,n; 1781 const PetscInt *idx,*ii,*ridx = NULL; 1782 PetscBool usecprow = a->compressedrow.use; 1783 1784 PetscFunctionBegin; 1785 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1786 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1787 1788 idx = a->j; 1789 v = a->a; 1790 if (usecprow) { 1791 if (zz != yy) { 1792 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1793 } 1794 mbs = a->compressedrow.nrows; 1795 ii = a->compressedrow.i; 1796 ridx = a->compressedrow.rindex; 1797 } else { 1798 ii = a->i; 1799 y = yarray; 1800 z = zarray; 1801 } 1802 1803 for (i=0; i<mbs; i++) { 1804 n = ii[1] - ii[0]; ii++; 1805 if (usecprow) { 1806 z = zarray + 3*ridx[i]; 1807 y = yarray + 3*ridx[i]; 1808 } 1809 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1810 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1811 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1812 for (j=0; j<n; j++) { 1813 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1814 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1815 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1816 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1817 v += 9; 1818 } 1819 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1820 if (!usecprow) { 1821 z += 3; y += 3; 1822 } 1823 } 1824 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1825 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1826 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1827 PetscFunctionReturn(0); 1828 } 1829 1830 #undef __FUNCT__ 1831 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1832 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1833 { 1834 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1835 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1836 const PetscScalar *x,*xb; 1837 const MatScalar *v; 1838 PetscErrorCode ierr; 1839 PetscInt mbs = a->mbs,i,j,n; 1840 const PetscInt *idx,*ii,*ridx=NULL; 1841 PetscBool usecprow=a->compressedrow.use; 1842 1843 PetscFunctionBegin; 1844 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1845 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1846 1847 idx = a->j; 1848 v = a->a; 1849 if (usecprow) { 1850 if (zz != yy) { 1851 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1852 } 1853 mbs = a->compressedrow.nrows; 1854 ii = a->compressedrow.i; 1855 ridx = a->compressedrow.rindex; 1856 } else { 1857 ii = a->i; 1858 y = yarray; 1859 z = zarray; 1860 } 1861 1862 for (i=0; i<mbs; i++) { 1863 n = ii[1] - ii[0]; ii++; 1864 if (usecprow) { 1865 z = zarray + 4*ridx[i]; 1866 y = yarray + 4*ridx[i]; 1867 } 1868 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1869 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1870 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1871 for (j=0; j<n; j++) { 1872 xb = x + 4*(*idx++); 1873 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1874 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1875 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1876 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1877 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1878 v += 16; 1879 } 1880 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1881 if (!usecprow) { 1882 z += 4; y += 4; 1883 } 1884 } 1885 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1886 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1887 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1888 PetscFunctionReturn(0); 1889 } 1890 1891 #undef __FUNCT__ 1892 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1893 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1894 { 1895 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1896 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1897 const PetscScalar *x,*xb; 1898 PetscScalar *yarray,*zarray; 1899 const MatScalar *v; 1900 PetscErrorCode ierr; 1901 PetscInt mbs = a->mbs,i,j,n; 1902 const PetscInt *idx,*ii,*ridx = NULL; 1903 PetscBool usecprow=a->compressedrow.use; 1904 1905 PetscFunctionBegin; 1906 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1907 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1908 1909 idx = a->j; 1910 v = a->a; 1911 if (usecprow) { 1912 if (zz != yy) { 1913 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1914 } 1915 mbs = a->compressedrow.nrows; 1916 ii = a->compressedrow.i; 1917 ridx = a->compressedrow.rindex; 1918 } else { 1919 ii = a->i; 1920 y = yarray; 1921 z = zarray; 1922 } 1923 1924 for (i=0; i<mbs; i++) { 1925 n = ii[1] - ii[0]; ii++; 1926 if (usecprow) { 1927 z = zarray + 5*ridx[i]; 1928 y = yarray + 5*ridx[i]; 1929 } 1930 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1931 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1932 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1933 for (j=0; j<n; j++) { 1934 xb = x + 5*(*idx++); 1935 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1936 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1937 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1938 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1939 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1940 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1941 v += 25; 1942 } 1943 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1944 if (!usecprow) { 1945 z += 5; y += 5; 1946 } 1947 } 1948 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1949 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1950 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1951 PetscFunctionReturn(0); 1952 } 1953 #undef __FUNCT__ 1954 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1955 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1956 { 1957 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1958 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 1959 const PetscScalar *x,*xb; 1960 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1961 const MatScalar *v; 1962 PetscErrorCode ierr; 1963 PetscInt mbs = a->mbs,i,j,n; 1964 const PetscInt *idx,*ii,*ridx=NULL; 1965 PetscBool usecprow=a->compressedrow.use; 1966 1967 PetscFunctionBegin; 1968 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1969 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 1970 1971 idx = a->j; 1972 v = a->a; 1973 if (usecprow) { 1974 if (zz != yy) { 1975 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1976 } 1977 mbs = a->compressedrow.nrows; 1978 ii = a->compressedrow.i; 1979 ridx = a->compressedrow.rindex; 1980 } else { 1981 ii = a->i; 1982 y = yarray; 1983 z = zarray; 1984 } 1985 1986 for (i=0; i<mbs; i++) { 1987 n = ii[1] - ii[0]; ii++; 1988 if (usecprow) { 1989 z = zarray + 6*ridx[i]; 1990 y = yarray + 6*ridx[i]; 1991 } 1992 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 1993 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1994 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1995 for (j=0; j<n; j++) { 1996 xb = x + 6*(*idx++); 1997 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1998 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1999 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2000 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2001 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2002 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2003 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2004 v += 36; 2005 } 2006 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2007 if (!usecprow) { 2008 z += 6; y += 6; 2009 } 2010 } 2011 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2012 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2013 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2014 PetscFunctionReturn(0); 2015 } 2016 2017 #undef __FUNCT__ 2018 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2019 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2020 { 2021 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2022 PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2023 const PetscScalar *x,*xb; 2024 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2025 const MatScalar *v; 2026 PetscErrorCode ierr; 2027 PetscInt mbs = a->mbs,i,j,n; 2028 const PetscInt *idx,*ii,*ridx = NULL; 2029 PetscBool usecprow=a->compressedrow.use; 2030 2031 PetscFunctionBegin; 2032 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2033 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2034 2035 idx = a->j; 2036 v = a->a; 2037 if (usecprow) { 2038 if (zz != yy) { 2039 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2040 } 2041 mbs = a->compressedrow.nrows; 2042 ii = a->compressedrow.i; 2043 ridx = a->compressedrow.rindex; 2044 } else { 2045 ii = a->i; 2046 y = yarray; 2047 z = zarray; 2048 } 2049 2050 for (i=0; i<mbs; i++) { 2051 n = ii[1] - ii[0]; ii++; 2052 if (usecprow) { 2053 z = zarray + 7*ridx[i]; 2054 y = yarray + 7*ridx[i]; 2055 } 2056 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2057 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2058 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2059 for (j=0; j<n; j++) { 2060 xb = x + 7*(*idx++); 2061 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2062 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2063 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2064 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2065 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2066 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2067 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2068 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2069 v += 49; 2070 } 2071 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2072 if (!usecprow) { 2073 z += 7; y += 7; 2074 } 2075 } 2076 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2077 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr); 2078 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2079 PetscFunctionReturn(0); 2080 } 2081 2082 #undef __FUNCT__ 2083 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2084 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2085 { 2086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2087 PetscScalar *z = 0,*work,*workt,*zarray; 2088 const PetscScalar *x,*xb; 2089 const MatScalar *v; 2090 PetscErrorCode ierr; 2091 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2; 2092 PetscInt ncols,k; 2093 const PetscInt *ridx = NULL,*idx,*ii; 2094 PetscBool usecprow = a->compressedrow.use; 2095 2096 PetscFunctionBegin; 2097 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2098 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2099 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2100 2101 idx = a->j; 2102 v = a->a; 2103 if (usecprow) { 2104 mbs = a->compressedrow.nrows; 2105 ii = a->compressedrow.i; 2106 ridx = a->compressedrow.rindex; 2107 } else { 2108 mbs = a->mbs; 2109 ii = a->i; 2110 z = zarray; 2111 } 2112 2113 if (!a->mult_work) { 2114 k = PetscMax(A->rmap->n,A->cmap->n); 2115 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2116 } 2117 work = a->mult_work; 2118 for (i=0; i<mbs; i++) { 2119 n = ii[1] - ii[0]; ii++; 2120 ncols = n*bs; 2121 workt = work; 2122 for (j=0; j<n; j++) { 2123 xb = x + bs*(*idx++); 2124 for (k=0; k<bs; k++) workt[k] = xb[k]; 2125 workt += bs; 2126 } 2127 if (usecprow) z = zarray + bs*ridx[i]; 2128 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2129 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2130 v += n*bs2; 2131 if (!usecprow) z += bs; 2132 } 2133 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2134 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2135 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2136 PetscFunctionReturn(0); 2137 } 2138 2139 #undef __FUNCT__ 2140 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2141 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2142 { 2143 PetscScalar zero = 0.0; 2144 PetscErrorCode ierr; 2145 2146 PetscFunctionBegin; 2147 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2148 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2149 PetscFunctionReturn(0); 2150 } 2151 2152 #undef __FUNCT__ 2153 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2154 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2155 { 2156 PetscScalar zero = 0.0; 2157 PetscErrorCode ierr; 2158 2159 PetscFunctionBegin; 2160 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2161 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2162 PetscFunctionReturn(0); 2163 } 2164 2165 #undef __FUNCT__ 2166 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2167 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2168 { 2169 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2170 PetscScalar *z,x1,x2,x3,x4,x5; 2171 const PetscScalar *x,*xb = NULL; 2172 const MatScalar *v; 2173 PetscErrorCode ierr; 2174 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n; 2175 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2176 Mat_CompressedRow cprow = a->compressedrow; 2177 PetscBool usecprow = cprow.use; 2178 2179 PetscFunctionBegin; 2180 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2181 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2182 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2183 2184 idx = a->j; 2185 v = a->a; 2186 if (usecprow) { 2187 mbs = cprow.nrows; 2188 ii = cprow.i; 2189 ridx = cprow.rindex; 2190 } else { 2191 mbs=a->mbs; 2192 ii = a->i; 2193 xb = x; 2194 } 2195 2196 switch (bs) { 2197 case 1: 2198 for (i=0; i<mbs; i++) { 2199 if (usecprow) xb = x + ridx[i]; 2200 x1 = xb[0]; 2201 ib = idx + ii[0]; 2202 n = ii[1] - ii[0]; ii++; 2203 for (j=0; j<n; j++) { 2204 rval = ib[j]; 2205 z[rval] += PetscConj(*v) * x1; 2206 v++; 2207 } 2208 if (!usecprow) xb++; 2209 } 2210 break; 2211 case 2: 2212 for (i=0; i<mbs; i++) { 2213 if (usecprow) xb = x + 2*ridx[i]; 2214 x1 = xb[0]; x2 = xb[1]; 2215 ib = idx + ii[0]; 2216 n = ii[1] - ii[0]; ii++; 2217 for (j=0; j<n; j++) { 2218 rval = ib[j]*2; 2219 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2220 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2221 v += 4; 2222 } 2223 if (!usecprow) xb += 2; 2224 } 2225 break; 2226 case 3: 2227 for (i=0; i<mbs; i++) { 2228 if (usecprow) xb = x + 3*ridx[i]; 2229 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2230 ib = idx + ii[0]; 2231 n = ii[1] - ii[0]; ii++; 2232 for (j=0; j<n; j++) { 2233 rval = ib[j]*3; 2234 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2235 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2236 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2237 v += 9; 2238 } 2239 if (!usecprow) xb += 3; 2240 } 2241 break; 2242 case 4: 2243 for (i=0; i<mbs; i++) { 2244 if (usecprow) xb = x + 4*ridx[i]; 2245 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2246 ib = idx + ii[0]; 2247 n = ii[1] - ii[0]; ii++; 2248 for (j=0; j<n; j++) { 2249 rval = ib[j]*4; 2250 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2251 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2252 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2253 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2254 v += 16; 2255 } 2256 if (!usecprow) xb += 4; 2257 } 2258 break; 2259 case 5: 2260 for (i=0; i<mbs; i++) { 2261 if (usecprow) xb = x + 5*ridx[i]; 2262 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2263 x4 = xb[3]; x5 = xb[4]; 2264 ib = idx + ii[0]; 2265 n = ii[1] - ii[0]; ii++; 2266 for (j=0; j<n; j++) { 2267 rval = ib[j]*5; 2268 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2269 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2270 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2271 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2272 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2273 v += 25; 2274 } 2275 if (!usecprow) xb += 5; 2276 } 2277 break; 2278 default: /* block sizes larger than 5 by 5 are handled by BLAS */ 2279 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2280 #if 0 2281 { 2282 PetscInt ncols,k,bs2=a->bs2; 2283 PetscScalar *work,*workt,zb; 2284 const PetscScalar *xtmp; 2285 if (!a->mult_work) { 2286 k = PetscMax(A->rmap->n,A->cmap->n); 2287 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2288 } 2289 work = a->mult_work; 2290 xtmp = x; 2291 for (i=0; i<mbs; i++) { 2292 n = ii[1] - ii[0]; ii++; 2293 ncols = n*bs; 2294 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2295 if (usecprow) xtmp = x + bs*ridx[i]; 2296 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2297 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2298 v += n*bs2; 2299 if (!usecprow) xtmp += bs; 2300 workt = work; 2301 for (j=0; j<n; j++) { 2302 zb = z + bs*(*idx++); 2303 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2304 workt += bs; 2305 } 2306 } 2307 } 2308 #endif 2309 } 2310 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2311 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2312 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 #undef __FUNCT__ 2317 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2318 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2319 { 2320 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2321 PetscScalar *zb,*z,x1,x2,x3,x4,x5; 2322 const PetscScalar *x,*xb = 0; 2323 const MatScalar *v; 2324 PetscErrorCode ierr; 2325 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2; 2326 const PetscInt *idx,*ii,*ib,*ridx = NULL; 2327 Mat_CompressedRow cprow = a->compressedrow; 2328 PetscBool usecprow=cprow.use; 2329 2330 PetscFunctionBegin; 2331 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2332 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 2333 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2334 2335 idx = a->j; 2336 v = a->a; 2337 if (usecprow) { 2338 mbs = cprow.nrows; 2339 ii = cprow.i; 2340 ridx = cprow.rindex; 2341 } else { 2342 mbs=a->mbs; 2343 ii = a->i; 2344 xb = x; 2345 } 2346 2347 switch (bs) { 2348 case 1: 2349 for (i=0; i<mbs; i++) { 2350 if (usecprow) xb = x + ridx[i]; 2351 x1 = xb[0]; 2352 ib = idx + ii[0]; 2353 n = ii[1] - ii[0]; ii++; 2354 for (j=0; j<n; j++) { 2355 rval = ib[j]; 2356 z[rval] += *v * x1; 2357 v++; 2358 } 2359 if (!usecprow) xb++; 2360 } 2361 break; 2362 case 2: 2363 for (i=0; i<mbs; i++) { 2364 if (usecprow) xb = x + 2*ridx[i]; 2365 x1 = xb[0]; x2 = xb[1]; 2366 ib = idx + ii[0]; 2367 n = ii[1] - ii[0]; ii++; 2368 for (j=0; j<n; j++) { 2369 rval = ib[j]*2; 2370 z[rval++] += v[0]*x1 + v[1]*x2; 2371 z[rval++] += v[2]*x1 + v[3]*x2; 2372 v += 4; 2373 } 2374 if (!usecprow) xb += 2; 2375 } 2376 break; 2377 case 3: 2378 for (i=0; i<mbs; i++) { 2379 if (usecprow) xb = x + 3*ridx[i]; 2380 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2381 ib = idx + ii[0]; 2382 n = ii[1] - ii[0]; ii++; 2383 for (j=0; j<n; j++) { 2384 rval = ib[j]*3; 2385 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2386 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2387 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2388 v += 9; 2389 } 2390 if (!usecprow) xb += 3; 2391 } 2392 break; 2393 case 4: 2394 for (i=0; i<mbs; i++) { 2395 if (usecprow) xb = x + 4*ridx[i]; 2396 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2397 ib = idx + ii[0]; 2398 n = ii[1] - ii[0]; ii++; 2399 for (j=0; j<n; j++) { 2400 rval = ib[j]*4; 2401 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2402 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2403 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2404 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2405 v += 16; 2406 } 2407 if (!usecprow) xb += 4; 2408 } 2409 break; 2410 case 5: 2411 for (i=0; i<mbs; i++) { 2412 if (usecprow) xb = x + 5*ridx[i]; 2413 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2414 x4 = xb[3]; x5 = xb[4]; 2415 ib = idx + ii[0]; 2416 n = ii[1] - ii[0]; ii++; 2417 for (j=0; j<n; j++) { 2418 rval = ib[j]*5; 2419 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2420 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2421 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2422 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2423 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2424 v += 25; 2425 } 2426 if (!usecprow) xb += 5; 2427 } 2428 break; 2429 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2430 PetscInt ncols,k; 2431 PetscScalar *work,*workt; 2432 const PetscScalar *xtmp; 2433 if (!a->mult_work) { 2434 k = PetscMax(A->rmap->n,A->cmap->n); 2435 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr); 2436 } 2437 work = a->mult_work; 2438 xtmp = x; 2439 for (i=0; i<mbs; i++) { 2440 n = ii[1] - ii[0]; ii++; 2441 ncols = n*bs; 2442 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2443 if (usecprow) xtmp = x + bs*ridx[i]; 2444 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2445 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2446 v += n*bs2; 2447 if (!usecprow) xtmp += bs; 2448 workt = work; 2449 for (j=0; j<n; j++) { 2450 zb = z + bs*(*idx++); 2451 for (k=0; k<bs; k++) zb[k] += workt[k]; 2452 workt += bs; 2453 } 2454 } 2455 } 2456 } 2457 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 2458 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2459 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2460 PetscFunctionReturn(0); 2461 } 2462 2463 #undef __FUNCT__ 2464 #define __FUNCT__ "MatScale_SeqBAIJ" 2465 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2466 { 2467 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2468 PetscInt totalnz = a->bs2*a->nz; 2469 PetscScalar oalpha = alpha; 2470 PetscErrorCode ierr; 2471 PetscBLASInt one = 1,tnz; 2472 2473 PetscFunctionBegin; 2474 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2475 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2476 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2477 PetscFunctionReturn(0); 2478 } 2479 2480 #undef __FUNCT__ 2481 #define __FUNCT__ "MatNorm_SeqBAIJ" 2482 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2483 { 2484 PetscErrorCode ierr; 2485 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2486 MatScalar *v = a->a; 2487 PetscReal sum = 0.0; 2488 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2489 2490 PetscFunctionBegin; 2491 if (type == NORM_FROBENIUS) { 2492 for (i=0; i< bs2*nz; i++) { 2493 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2494 } 2495 *norm = PetscSqrtReal(sum); 2496 } else if (type == NORM_1) { /* maximum column sum */ 2497 PetscReal *tmp; 2498 PetscInt *bcol = a->j; 2499 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr); 2500 for (i=0; i<nz; i++) { 2501 for (j=0; j<bs; j++) { 2502 k1 = bs*(*bcol) + j; /* column index */ 2503 for (k=0; k<bs; k++) { 2504 tmp[k1] += PetscAbsScalar(*v); v++; 2505 } 2506 } 2507 bcol++; 2508 } 2509 *norm = 0.0; 2510 for (j=0; j<A->cmap->n; j++) { 2511 if (tmp[j] > *norm) *norm = tmp[j]; 2512 } 2513 ierr = PetscFree(tmp);CHKERRQ(ierr); 2514 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2515 *norm = 0.0; 2516 for (k=0; k<bs; k++) { 2517 for (j=0; j<a->mbs; j++) { 2518 v = a->a + bs2*a->i[j] + k; 2519 sum = 0.0; 2520 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2521 for (k1=0; k1<bs; k1++) { 2522 sum += PetscAbsScalar(*v); 2523 v += bs; 2524 } 2525 } 2526 if (sum > *norm) *norm = sum; 2527 } 2528 } 2529 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2530 PetscFunctionReturn(0); 2531 } 2532 2533 2534 #undef __FUNCT__ 2535 #define __FUNCT__ "MatEqual_SeqBAIJ" 2536 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2537 { 2538 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2539 PetscErrorCode ierr; 2540 2541 PetscFunctionBegin; 2542 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2543 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2544 *flg = PETSC_FALSE; 2545 PetscFunctionReturn(0); 2546 } 2547 2548 /* if the a->i are the same */ 2549 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2550 if (!*flg) PetscFunctionReturn(0); 2551 2552 /* if a->j are the same */ 2553 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2554 if (!*flg) PetscFunctionReturn(0); 2555 2556 /* if a->a are the same */ 2557 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2558 PetscFunctionReturn(0); 2559 2560 } 2561 2562 #undef __FUNCT__ 2563 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2564 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2565 { 2566 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2567 PetscErrorCode ierr; 2568 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2569 PetscScalar *x,zero = 0.0; 2570 MatScalar *aa,*aa_j; 2571 2572 PetscFunctionBegin; 2573 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2574 bs = A->rmap->bs; 2575 aa = a->a; 2576 ai = a->i; 2577 aj = a->j; 2578 ambs = a->mbs; 2579 bs2 = a->bs2; 2580 2581 ierr = VecSet(v,zero);CHKERRQ(ierr); 2582 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2583 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2584 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2585 for (i=0; i<ambs; i++) { 2586 for (j=ai[i]; j<ai[i+1]; j++) { 2587 if (aj[j] == i) { 2588 row = i*bs; 2589 aa_j = aa+j*bs2; 2590 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2591 break; 2592 } 2593 } 2594 } 2595 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2596 PetscFunctionReturn(0); 2597 } 2598 2599 #undef __FUNCT__ 2600 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2601 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2602 { 2603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2604 const PetscScalar *l,*r,*li,*ri; 2605 PetscScalar x; 2606 MatScalar *aa, *v; 2607 PetscErrorCode ierr; 2608 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2609 const PetscInt *ai,*aj; 2610 2611 PetscFunctionBegin; 2612 ai = a->i; 2613 aj = a->j; 2614 aa = a->a; 2615 m = A->rmap->n; 2616 n = A->cmap->n; 2617 bs = A->rmap->bs; 2618 mbs = a->mbs; 2619 bs2 = a->bs2; 2620 if (ll) { 2621 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2622 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2623 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2624 for (i=0; i<mbs; i++) { /* for each block row */ 2625 M = ai[i+1] - ai[i]; 2626 li = l + i*bs; 2627 v = aa + bs2*ai[i]; 2628 for (j=0; j<M; j++) { /* for each block */ 2629 for (k=0; k<bs2; k++) { 2630 (*v++) *= li[k%bs]; 2631 } 2632 } 2633 } 2634 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2635 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2636 } 2637 2638 if (rr) { 2639 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2640 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2641 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2642 for (i=0; i<mbs; i++) { /* for each block row */ 2643 iai = ai[i]; 2644 M = ai[i+1] - iai; 2645 v = aa + bs2*iai; 2646 for (j=0; j<M; j++) { /* for each block */ 2647 ri = r + bs*aj[iai+j]; 2648 for (k=0; k<bs; k++) { 2649 x = ri[k]; 2650 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2651 v += bs; 2652 } 2653 } 2654 } 2655 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2656 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2657 } 2658 PetscFunctionReturn(0); 2659 } 2660 2661 2662 #undef __FUNCT__ 2663 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2664 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2665 { 2666 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2667 2668 PetscFunctionBegin; 2669 info->block_size = a->bs2; 2670 info->nz_allocated = a->bs2*a->maxnz; 2671 info->nz_used = a->bs2*a->nz; 2672 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2673 info->assemblies = A->num_ass; 2674 info->mallocs = A->info.mallocs; 2675 info->memory = ((PetscObject)A)->mem; 2676 if (A->factortype) { 2677 info->fill_ratio_given = A->info.fill_ratio_given; 2678 info->fill_ratio_needed = A->info.fill_ratio_needed; 2679 info->factor_mallocs = A->info.factor_mallocs; 2680 } else { 2681 info->fill_ratio_given = 0; 2682 info->fill_ratio_needed = 0; 2683 info->factor_mallocs = 0; 2684 } 2685 PetscFunctionReturn(0); 2686 } 2687 2688 2689 #if defined(PETSC_THREADCOMM_ACTIVE) 2690 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2691 { 2692 PetscErrorCode ierr; 2693 PetscInt *trstarts=A->rmap->trstarts; 2694 PetscInt n,start,end; 2695 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2696 2697 start = trstarts[thread_id]/A->rmap->bs; 2698 end = trstarts[thread_id+1]/A->rmap->bs; 2699 n = a->i[end] - a->i[start]; 2700 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2701 return 0; 2702 } 2703 2704 #undef __FUNCT__ 2705 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2706 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2707 { 2708 PetscErrorCode ierr; 2709 2710 2711 PetscFunctionBegin; 2712 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2713 PetscFunctionReturn(0); 2714 } 2715 #else 2716 #undef __FUNCT__ 2717 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2718 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2719 { 2720 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2721 PetscErrorCode ierr; 2722 2723 PetscFunctionBegin; 2724 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2725 PetscFunctionReturn(0); 2726 } 2727 #endif 2728