1 2 #include <../src/mat/impls/baij/seq/baij.h> 3 #include <petsc-private/kernels/blockinvert.h> 4 #include <petscbt.h> 5 #include <petscblaslapack.h> 6 #if defined(PETSC_THREADCOMM_ACTIVE) 7 #include <petscthreadcomm.h> 8 #endif 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ" 12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 15 PetscErrorCode ierr; 16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival; 17 const PetscInt *idx; 18 PetscInt start,end,*ai,*aj,bs,*nidx2; 19 PetscBT table; 20 21 PetscFunctionBegin; 22 m = a->mbs; 23 ai = a->i; 24 aj = a->j; 25 bs = A->rmap->bs; 26 27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr); 30 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 31 ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr); 32 33 for (i=0; i<is_max; i++) { 34 /* Initialise the two local arrays */ 35 isz = 0; 36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr); 37 38 /* Extract the indices, assume there can be duplicate entries */ 39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr); 40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr); 41 42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */ 43 for (j=0; j<n; ++j) { 44 ival = idx[j]/bs; /* convert the indices into block indices */ 45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival; 47 } 48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr); 49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 50 51 k = 0; 52 for (j=0; j<ov; j++) { /* for each overlap*/ 53 n = isz; 54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */ 55 row = nidx[k]; 56 start = ai[row]; 57 end = ai[row+1]; 58 for (l = start; l<end; l++) { 59 val = aj[l]; 60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val; 61 } 62 } 63 } 64 /* expand the Index Set */ 65 for (j=0; j<isz; j++) { 66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k; 67 } 68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 69 } 70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr); 71 ierr = PetscFree(nidx);CHKERRQ(ierr); 72 ierr = PetscFree(nidx2);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 #undef __FUNCT__ 77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private" 78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 79 { 80 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c; 81 PetscErrorCode ierr; 82 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens; 83 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen; 84 const PetscInt *irow,*icol; 85 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2; 86 PetscInt *aj = a->j,*ai = a->i; 87 MatScalar *mat_a; 88 Mat C; 89 PetscBool flag,sorted; 90 91 PetscFunctionBegin; 92 ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr); 93 if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted"); 94 95 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 96 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 97 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 98 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 99 100 ierr = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr); 101 ssmap = smap; 102 ierr = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr); 103 ierr = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr); 104 for (i=0; i<ncols; i++) smap[icol[i]] = i+1; 105 /* determine lens of each row */ 106 for (i=0; i<nrows; i++) { 107 kstart = ai[irow[i]]; 108 kend = kstart + a->ilen[irow[i]]; 109 lens[i] = 0; 110 for (k=kstart; k<kend; k++) { 111 if (ssmap[aj[k]]) lens[i]++; 112 } 113 } 114 /* Create and fill new matrix */ 115 if (scall == MAT_REUSE_MATRIX) { 116 c = (Mat_SeqBAIJ*)((*B)->data); 117 118 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size"); 119 ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 120 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 121 ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr); 122 C = *B; 123 } else { 124 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr); 125 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 126 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 127 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr); 128 } 129 c = (Mat_SeqBAIJ*)(C->data); 130 for (i=0; i<nrows; i++) { 131 row = irow[i]; 132 kstart = ai[row]; 133 kend = kstart + a->ilen[row]; 134 mat_i = c->i[i]; 135 mat_j = c->j + mat_i; 136 mat_a = c->a + mat_i*bs2; 137 mat_ilen = c->ilen + i; 138 for (k=kstart; k<kend; k++) { 139 if ((tcol=ssmap[a->j[k]])) { 140 *mat_j++ = tcol - 1; 141 ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 142 mat_a += bs2; 143 (*mat_ilen)++; 144 } 145 } 146 } 147 148 /* Free work space */ 149 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 150 ierr = PetscFree(smap);CHKERRQ(ierr); 151 ierr = PetscFree(lens);CHKERRQ(ierr); 152 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 153 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 154 155 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 156 *B = C; 157 PetscFunctionReturn(0); 158 } 159 160 #undef __FUNCT__ 161 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ" 162 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B) 163 { 164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 165 IS is1,is2; 166 PetscErrorCode ierr; 167 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count; 168 const PetscInt *irow,*icol; 169 170 PetscFunctionBegin; 171 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr); 172 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr); 173 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr); 174 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr); 175 176 /* Verify if the indices corespond to each element in a block 177 and form the IS with compressed IS */ 178 ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr); 179 ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr); 180 for (i=0; i<nrows; i++) vary[irow[i]/bs]++; 181 count = 0; 182 for (i=0; i<a->mbs; i++) { 183 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks"); 184 if (vary[i]==bs) iary[count++] = i; 185 } 186 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr); 187 188 ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr); 189 for (i=0; i<ncols; i++) vary[icol[i]/bs]++; 190 count = 0; 191 for (i=0; i<a->mbs; i++) { 192 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc"); 193 if (vary[i]==bs) iary[count++] = i; 194 } 195 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr); 196 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr); 197 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr); 198 ierr = PetscFree2(vary,iary);CHKERRQ(ierr); 199 200 ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr); 201 ierr = ISDestroy(&is1);CHKERRQ(ierr); 202 ierr = ISDestroy(&is2);CHKERRQ(ierr); 203 PetscFunctionReturn(0); 204 } 205 206 #undef __FUNCT__ 207 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ" 208 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[]) 209 { 210 PetscErrorCode ierr; 211 PetscInt i; 212 213 PetscFunctionBegin; 214 if (scall == MAT_INITIAL_MATRIX) { 215 ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr); 216 } 217 218 for (i=0; i<n; i++) { 219 ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr); 220 } 221 PetscFunctionReturn(0); 222 } 223 224 225 /* -------------------------------------------------------*/ 226 /* Should check that shapes of vectors and matrices match */ 227 /* -------------------------------------------------------*/ 228 229 #if defined(PETSC_THREADCOMM_ACTIVE) 230 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 231 { 232 PetscErrorCode ierr; 233 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 234 PetscScalar *z; 235 const PetscScalar *x; 236 const MatScalar *aa; 237 PetscInt *trstarts=A->rmap->trstarts; 238 PetscInt n,start,end,i; 239 const PetscInt *aj,*ai; 240 PetscScalar sum; 241 242 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 243 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 244 start = trstarts[thread_id]; 245 end = trstarts[thread_id+1]; 246 ai = a->i; 247 for (i=start; i<end; i++) { 248 n = ai[i+1] - ai[i]; 249 aj = a->j + ai[i]; 250 aa = a->a + ai[i]; 251 sum = 0.0; 252 PetscSparseDensePlusDot(sum,x,aa,aj,n); 253 z[i] = sum; 254 } 255 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 256 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 257 return 0; 258 } 259 260 #undef __FUNCT__ 261 #define __FUNCT__ "MatMult_SeqBAIJ_1" 262 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 263 { 264 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 265 PetscScalar *z,sum; 266 const PetscScalar *x; 267 const MatScalar *v; 268 PetscErrorCode ierr; 269 PetscInt mbs,i,n; 270 const PetscInt *idx,*ii,*ridx=NULL; 271 PetscBool usecprow=a->compressedrow.use; 272 273 PetscFunctionBegin; 274 275 if (usecprow) { 276 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 277 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 278 mbs = a->compressedrow.nrows; 279 ii = a->compressedrow.i; 280 ridx = a->compressedrow.rindex; 281 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 282 for (i=0; i<mbs; i++) { 283 n = ii[i+1] - ii[i]; 284 v = a->a + ii[i]; 285 idx = a->j + ii[i]; 286 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 287 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 288 sum = 0.0; 289 PetscSparseDensePlusDot(sum,x,v,idx,n); 290 z[ridx[i]] = sum; 291 } 292 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 293 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 294 } else { 295 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr); 296 } 297 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 298 PetscFunctionReturn(0); 299 } 300 #else 301 #undef __FUNCT__ 302 #define __FUNCT__ "MatMult_SeqBAIJ_1" 303 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz) 304 { 305 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 306 PetscScalar *z,sum; 307 const PetscScalar *x; 308 const MatScalar *v; 309 PetscErrorCode ierr; 310 PetscInt mbs,i,n; 311 const PetscInt *idx,*ii,*ridx=NULL; 312 PetscBool usecprow=a->compressedrow.use; 313 314 PetscFunctionBegin; 315 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 316 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 317 318 if (usecprow) { 319 mbs = a->compressedrow.nrows; 320 ii = a->compressedrow.i; 321 ridx = a->compressedrow.rindex; 322 ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 323 } else { 324 mbs = a->mbs; 325 ii = a->i; 326 } 327 328 for (i=0; i<mbs; i++) { 329 n = ii[1] - ii[0]; 330 v = a->a + ii[0]; 331 idx = a->j + ii[0]; 332 ii++; 333 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 334 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 335 sum = 0.0; 336 PetscSparseDensePlusDot(sum,x,v,idx,n); 337 if (usecprow) { 338 z[ridx[i]] = sum; 339 } else { 340 z[i] = sum; 341 } 342 } 343 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 344 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 345 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 346 PetscFunctionReturn(0); 347 } 348 #endif 349 350 #if defined(PETSC_THREADCOMM_ACTIVE) 351 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 352 { 353 PetscErrorCode ierr; 354 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 355 PetscScalar *z,x1,x2,sum1,sum2; 356 const PetscScalar *x,*xb; 357 const MatScalar *aa; 358 PetscInt *trstarts=A->rmap->trstarts; 359 PetscInt n,start,end,i,j; 360 const PetscInt *aj,*ai; 361 362 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 363 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 364 start = trstarts[thread_id] / 2; 365 end = trstarts[thread_id+1] / 2; 366 ai = a->i; 367 for (i=start; i<end; i++) { 368 n = ai[i+1] - ai[i]; 369 aj = a->j + ai[i]; 370 aa = a->a + ai[i]*4; 371 sum1 = 0.0; sum2 = 0.0; 372 for (j=0; j<n; j++) { 373 xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1]; 374 sum1 += aa[4*j]*x1 + aa[4*j+2]*x2; 375 sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2; 376 } 377 z[2*i] = sum1; z[2*i+1] = sum2; 378 } 379 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 380 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 381 return 0; 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatMult_SeqBAIJ_2" 386 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 387 { 388 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 389 PetscScalar *z,x1,x2,sum1,sum2; 390 const PetscScalar *x,*xb; 391 const MatScalar *v; 392 PetscErrorCode ierr; 393 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 394 PetscBool usecprow=a->compressedrow.use; 395 396 PetscFunctionBegin; 397 398 if (usecprow) { 399 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 400 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 401 mbs = a->compressedrow.nrows; 402 ii = a->compressedrow.i; 403 ridx = a->compressedrow.rindex; 404 for (i=0; i<mbs; i++) { 405 n = ii[i+1] - ii[i]; 406 idx = a->j + ii[i]; 407 v = a->a + ii[i]*4; 408 sum1 = 0.0; sum2 = 0.0; 409 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 410 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 411 for (j=0; j<n; j++) { 412 xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1]; 413 sum1 += v[4*j]*x1 + v[4*j+2]*x2; 414 sum2 += v[4*j+1]*x1 + v[4*j+3]*x2; 415 } 416 z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2; 417 } 418 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 419 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 420 } else { 421 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr); 422 } 423 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 424 PetscFunctionReturn(0); 425 } 426 #else 427 #undef __FUNCT__ 428 #define __FUNCT__ "MatMult_SeqBAIJ_2" 429 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz) 430 { 431 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 432 PetscScalar *z = 0,sum1,sum2,*zarray; 433 const PetscScalar *x,*xb; 434 PetscScalar x1,x2; 435 const MatScalar *v; 436 PetscErrorCode ierr; 437 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 438 PetscBool usecprow=a->compressedrow.use; 439 440 PetscFunctionBegin; 441 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 442 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 443 444 idx = a->j; 445 v = a->a; 446 if (usecprow) { 447 mbs = a->compressedrow.nrows; 448 ii = a->compressedrow.i; 449 ridx = a->compressedrow.rindex; 450 } else { 451 mbs = a->mbs; 452 ii = a->i; 453 z = zarray; 454 } 455 456 for (i=0; i<mbs; i++) { 457 n = ii[1] - ii[0]; ii++; 458 sum1 = 0.0; sum2 = 0.0; 459 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 460 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 461 for (j=0; j<n; j++) { 462 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1]; 463 sum1 += v[0]*x1 + v[2]*x2; 464 sum2 += v[1]*x1 + v[3]*x2; 465 v += 4; 466 } 467 if (usecprow) z = zarray + 2*ridx[i]; 468 z[0] = sum1; z[1] = sum2; 469 if (!usecprow) z += 2; 470 } 471 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 472 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 473 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr); 474 PetscFunctionReturn(0); 475 } 476 #endif 477 478 #if defined(PETSC_THREADCOMM_ACTIVE) 479 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 480 { 481 PetscErrorCode ierr; 482 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 483 PetscScalar *z,x1,x2,x3,sum1,sum2,sum3; 484 const PetscScalar *x,*xb; 485 const MatScalar *aa; 486 PetscInt *trstarts=A->rmap->trstarts; 487 PetscInt n,start,end,i,j; 488 const PetscInt *aj,*ai; 489 490 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 491 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 492 start = trstarts[thread_id] / 3; 493 end = trstarts[thread_id+1] / 3; 494 ai = a->i; 495 for (i=start; i<end; i++) { 496 n = ai[i+1] - ai[i]; 497 aj = a->j + ai[i]; 498 aa = a->a + ai[i]*9; 499 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 500 for (j=0; j<n; j++) { 501 xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 502 sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3; 503 sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3; 504 sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3; 505 } 506 z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3; 507 } 508 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 509 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 510 return 0; 511 } 512 513 #undef __FUNCT__ 514 #define __FUNCT__ "MatMult_SeqBAIJ_3" 515 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 516 { 517 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 518 PetscScalar *z,sum1,sum2,sum3,x1,x2,x3; 519 const PetscScalar *x,*xb; 520 const MatScalar *v; 521 PetscErrorCode ierr; 522 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 523 PetscBool usecprow=a->compressedrow.use; 524 525 526 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 527 #pragma disjoint(*v,*z,*xb) 528 #endif 529 530 PetscFunctionBegin; 531 532 if (usecprow) { 533 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 534 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 535 mbs = a->compressedrow.nrows; 536 ii = a->compressedrow.i; 537 ridx = a->compressedrow.rindex; 538 for (i=0; i<mbs; i++) { 539 n = ii[i+1] - ii[i]; 540 idx = a->j + ii[i]; 541 v = a->a + ii[i]*9; 542 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 543 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 544 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 545 for (j=0; j<n; j++) { 546 xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 547 sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3; 548 sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3; 549 sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3; 550 } 551 z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3; 552 } 553 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 554 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 555 } else { 556 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr); 557 } 558 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 559 PetscFunctionReturn(0); 560 } 561 #else 562 #undef __FUNCT__ 563 #define __FUNCT__ "MatMult_SeqBAIJ_3" 564 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz) 565 { 566 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 567 PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray; 568 const PetscScalar *x,*xb; 569 const MatScalar *v; 570 PetscErrorCode ierr; 571 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 572 PetscBool usecprow=a->compressedrow.use; 573 574 575 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 576 #pragma disjoint(*v,*z,*xb) 577 #endif 578 579 PetscFunctionBegin; 580 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 581 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 582 583 idx = a->j; 584 v = a->a; 585 if (usecprow) { 586 mbs = a->compressedrow.nrows; 587 ii = a->compressedrow.i; 588 ridx = a->compressedrow.rindex; 589 } else { 590 mbs = a->mbs; 591 ii = a->i; 592 z = zarray; 593 } 594 595 for (i=0; i<mbs; i++) { 596 n = ii[1] - ii[0]; ii++; 597 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; 598 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 599 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 600 for (j=0; j<n; j++) { 601 xb = x + 3*(*idx++); 602 x1 = xb[0]; 603 x2 = xb[1]; 604 x3 = xb[2]; 605 606 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 607 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 608 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 609 v += 9; 610 } 611 if (usecprow) z = zarray + 3*ridx[i]; 612 z[0] = sum1; z[1] = sum2; z[2] = sum3; 613 if (!usecprow) z += 3; 614 } 615 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 616 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 617 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr); 618 PetscFunctionReturn(0); 619 } 620 #endif 621 622 #if defined(PETSC_THREADCOMM_ACTIVE) 623 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 624 { 625 PetscErrorCode ierr; 626 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 627 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 628 const PetscScalar *x,*xb; 629 const MatScalar *aa; 630 PetscInt *trstarts=A->rmap->trstarts; 631 PetscInt n,start,end,i,j; 632 const PetscInt *aj,*ai; 633 634 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 635 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 636 start = trstarts[thread_id] / 4; 637 end = trstarts[thread_id+1] / 4; 638 ai = a->i; 639 for (i=start; i<end; i++) { 640 n = ai[i+1] - ai[i]; 641 aj = a->j + ai[i]; 642 aa = a->a + ai[i]*16; 643 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 644 for (j=0; j<n; j++) { 645 xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 646 sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4; 647 sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4; 648 sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4; 649 sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4; 650 } 651 z[4*i] = sum1; z[4*i+1] = sum2; 652 z[4*i+2] = sum3; z[4*i+3] = sum4; 653 } 654 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 655 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 656 return 0; 657 } 658 659 #undef __FUNCT__ 660 #define __FUNCT__ "MatMult_SeqBAIJ_4" 661 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 662 { 663 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 664 PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4; 665 const PetscScalar *x,*xb; 666 const MatScalar *v; 667 PetscErrorCode ierr; 668 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 669 PetscBool usecprow=a->compressedrow.use; 670 671 PetscFunctionBegin; 672 673 if (usecprow) { 674 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 675 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 676 mbs = a->compressedrow.nrows; 677 ii = a->compressedrow.i; 678 ridx = a->compressedrow.rindex; 679 for (i=0; i<mbs; i++) { 680 n = ii[i+1] - ii[1]; 681 idx = a->j + ii[i]; 682 v = a->a + ii[i]*16; 683 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; 684 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 685 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 686 for (j=0; j<n; j++) { 687 xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 688 sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4; 689 sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4; 690 sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4; 691 sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4; 692 } 693 z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2; 694 z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4; 695 } 696 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 697 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 698 } else { 699 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr); 700 } 701 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 702 PetscFunctionReturn(0); 703 } 704 #else 705 #undef __FUNCT__ 706 #define __FUNCT__ "MatMult_SeqBAIJ_4" 707 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz) 708 { 709 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 710 PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray; 711 const PetscScalar *x,*xb; 712 const MatScalar *v; 713 PetscErrorCode ierr; 714 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 715 PetscBool usecprow=a->compressedrow.use; 716 717 PetscFunctionBegin; 718 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 719 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 720 721 idx = a->j; 722 v = a->a; 723 if (usecprow) { 724 mbs = a->compressedrow.nrows; 725 ii = a->compressedrow.i; 726 ridx = a->compressedrow.rindex; 727 } else { 728 mbs = a->mbs; 729 ii = a->i; 730 z = zarray; 731 } 732 733 for (i=0; i<mbs; i++) { 734 n = ii[1] - ii[0]; 735 ii++; 736 sum1 = 0.0; 737 sum2 = 0.0; 738 sum3 = 0.0; 739 sum4 = 0.0; 740 741 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 742 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 743 for (j=0; j<n; j++) { 744 xb = x + 4*(*idx++); 745 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 746 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 747 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 748 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 749 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 750 v += 16; 751 } 752 if (usecprow) z = zarray + 4*ridx[i]; 753 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 754 if (!usecprow) z += 4; 755 } 756 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 757 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 758 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr); 759 PetscFunctionReturn(0); 760 } 761 #endif 762 763 #if defined(PETSC_THREADCOMM_ACTIVE) 764 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 765 { 766 PetscErrorCode ierr; 767 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 768 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 769 const PetscScalar *x,*xb; 770 const MatScalar *aa; 771 PetscInt *trstarts=A->rmap->trstarts; 772 PetscInt n,start,end,i,j; 773 const PetscInt *aj,*ai; 774 775 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 776 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 777 start = trstarts[thread_id] / 5; 778 end = trstarts[thread_id+1] / 5; 779 ai = a->i; 780 for (i=start; i<end; i++) { 781 n = ai[i+1] - ai[i]; 782 aj = a->j + ai[i]; 783 aa = a->a + ai[i]*25; 784 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 785 for (j=0; j<n; j++) { 786 xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 787 sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5; 788 sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5; 789 sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5; 790 sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5; 791 sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5; 792 } 793 z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; 794 z[5*i+3] = sum4; z[5*i+4] = sum5; 795 } 796 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 797 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 798 return 0; 799 } 800 801 #undef __FUNCT__ 802 #define __FUNCT__ "MatMult_SeqBAIJ_5" 803 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 804 { 805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 806 PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5; 807 const PetscScalar *xb,*x; 808 const MatScalar *v; 809 PetscErrorCode ierr; 810 const PetscInt *idx,*ii,*ridx=NULL; 811 PetscInt mbs,i,j,n; 812 PetscBool usecprow=a->compressedrow.use; 813 814 PetscFunctionBegin; 815 816 if (usecprow) { 817 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 818 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 819 mbs = a->compressedrow.nrows; 820 ii = a->compressedrow.i; 821 ridx = a->compressedrow.rindex; 822 for (i=0; i<mbs; i++) { 823 n = ii[i+1] - ii[i]; 824 idx = a->j + ii[i]; 825 v = a->a + ii[i]*25; 826 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 827 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 828 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 829 for (j=0; j<n; j++) { 830 xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 831 sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5; 832 sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5; 833 sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5; 834 sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5; 835 sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5; 836 } 837 z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3; 838 z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5; 839 } 840 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 841 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 842 } else { 843 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr); 844 } 845 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 846 PetscFunctionReturn(0); 847 } 848 #else 849 #undef __FUNCT__ 850 #define __FUNCT__ "MatMult_SeqBAIJ_5" 851 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz) 852 { 853 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 854 PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray; 855 const PetscScalar *xb,*x; 856 const MatScalar *v; 857 PetscErrorCode ierr; 858 const PetscInt *idx,*ii,*ridx=NULL; 859 PetscInt mbs,i,j,n; 860 PetscBool usecprow=a->compressedrow.use; 861 862 PetscFunctionBegin; 863 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 864 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 865 866 idx = a->j; 867 v = a->a; 868 if (usecprow) { 869 mbs = a->compressedrow.nrows; 870 ii = a->compressedrow.i; 871 ridx = a->compressedrow.rindex; 872 } else { 873 mbs = a->mbs; 874 ii = a->i; 875 z = zarray; 876 } 877 878 for (i=0; i<mbs; i++) { 879 n = ii[1] - ii[0]; ii++; 880 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; 881 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 882 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 883 for (j=0; j<n; j++) { 884 xb = x + 5*(*idx++); 885 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 886 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 887 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 888 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 889 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 890 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 891 v += 25; 892 } 893 if (usecprow) z = zarray + 5*ridx[i]; 894 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 895 if (!usecprow) z += 5; 896 } 897 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 898 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 899 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr); 900 PetscFunctionReturn(0); 901 } 902 #endif 903 904 905 #if defined(PETSC_THREADCOMM_ACTIVE) 906 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 907 { 908 PetscErrorCode ierr; 909 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 910 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 911 const PetscScalar *x,*xb; 912 const MatScalar *aa; 913 PetscInt *trstarts=A->rmap->trstarts; 914 PetscInt n,start,end,i,j; 915 const PetscInt *aj,*ai; 916 917 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 918 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 919 start = trstarts[thread_id] / 6; 920 end = trstarts[thread_id+1] / 6; 921 ai = a->i; 922 for (i=start; i<end; i++) { 923 n = ai[i+1] - ai[i]; 924 aj = a->j + ai[i]; 925 aa = a->a + ai[i]*36; 926 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 927 for (j=0; j<n; j++) { 928 xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 929 sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6; 930 sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6; 931 sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6; 932 sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6; 933 sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6; 934 sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6; 935 } 936 z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3; 937 z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6; 938 } 939 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 940 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 941 return 0; 942 } 943 944 #undef __FUNCT__ 945 #define __FUNCT__ "MatMult_SeqBAIJ_6" 946 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 947 { 948 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 949 PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6; 950 const PetscScalar *x,*xb; 951 const MatScalar *v; 952 PetscErrorCode ierr; 953 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 954 PetscBool usecprow=a->compressedrow.use; 955 956 PetscFunctionBegin; 957 958 if (usecprow) { 959 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 960 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 961 mbs = a->compressedrow.nrows; 962 ii = a->compressedrow.i; 963 ridx = a->compressedrow.rindex; 964 for (i=0; i<mbs; i++) { 965 n = ii[i+1] - ii[i]; 966 idx = a->j + ii[i]; 967 v = a->a + ii[i]*36; 968 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; 969 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 970 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 971 for (j=0; j<n; j++) { 972 xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 973 sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6; 974 sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6; 975 sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6; 976 sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6; 977 sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6; 978 sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6; 979 } 980 z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3; 981 z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6; 982 } 983 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 984 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 985 } else { 986 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr); 987 } 988 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 989 PetscFunctionReturn(0); 990 } 991 #else 992 #undef __FUNCT__ 993 #define __FUNCT__ "MatMult_SeqBAIJ_6" 994 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz) 995 { 996 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 997 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6; 998 const PetscScalar *x,*xb; 999 PetscScalar x1,x2,x3,x4,x5,x6,*zarray; 1000 const MatScalar *v; 1001 PetscErrorCode ierr; 1002 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1003 PetscBool usecprow=a->compressedrow.use; 1004 1005 PetscFunctionBegin; 1006 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1007 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1008 1009 idx = a->j; 1010 v = a->a; 1011 if (usecprow) { 1012 mbs = a->compressedrow.nrows; 1013 ii = a->compressedrow.i; 1014 ridx = a->compressedrow.rindex; 1015 } else { 1016 mbs = a->mbs; 1017 ii = a->i; 1018 z = zarray; 1019 } 1020 1021 for (i=0; i<mbs; i++) { 1022 n = ii[1] - ii[0]; 1023 ii++; 1024 sum1 = 0.0; 1025 sum2 = 0.0; 1026 sum3 = 0.0; 1027 sum4 = 0.0; 1028 sum5 = 0.0; 1029 sum6 = 0.0; 1030 1031 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1032 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1033 for (j=0; j<n; j++) { 1034 xb = x + 6*(*idx++); 1035 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 1036 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1037 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1038 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1039 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1040 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1041 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1042 v += 36; 1043 } 1044 if (usecprow) z = zarray + 6*ridx[i]; 1045 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 1046 if (!usecprow) z += 6; 1047 } 1048 1049 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1050 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1051 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr); 1052 PetscFunctionReturn(0); 1053 } 1054 #endif 1055 1056 #if defined(PETSC_THREADCOMM_ACTIVE) 1057 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz) 1058 { 1059 PetscErrorCode ierr; 1060 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1061 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1062 const PetscScalar *x,*xb; 1063 const MatScalar *aa; 1064 PetscInt *trstarts=A->rmap->trstarts; 1065 PetscInt n,start,end,i,j; 1066 const PetscInt *aj,*ai; 1067 1068 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1069 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1070 start = trstarts[thread_id] / 7; 1071 end = trstarts[thread_id+1] / 7; 1072 ai = a->i; 1073 for (i=start; i<end; i++) { 1074 n = ai[i+1] - ai[i]; 1075 aj = a->j + ai[i]; 1076 aa = a->a + ai[i]*49; 1077 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1078 for (j=0; j<n; j++) { 1079 xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1080 sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7; 1081 sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7; 1082 sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7; 1083 sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7; 1084 sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7; 1085 sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7; 1086 sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7; 1087 } 1088 z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4; 1089 z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7; 1090 } 1091 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1092 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1093 return 0; 1094 } 1095 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1098 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1099 { 1100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1101 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1102 const PetscScalar *x,*xb; 1103 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1104 const MatScalar *v; 1105 PetscErrorCode ierr; 1106 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1107 PetscBool usecprow=a->compressedrow.use; 1108 1109 PetscFunctionBegin; 1110 1111 if (usecprow) { 1112 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1113 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1114 mbs = a->compressedrow.nrows; 1115 ii = a->compressedrow.i; 1116 ridx = a->compressedrow.rindex; 1117 for (i=0; i<mbs; i++) { 1118 n = ii[i+1] - ii[i]; 1119 idx = a->j + ii[i]; 1120 v = a->a + ii[i]*49; 1121 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1122 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1123 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1124 for (j=0; j<n; j++) { 1125 xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1126 sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7; 1127 sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7; 1128 sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7; 1129 sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7; 1130 sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7; 1131 sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7; 1132 sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7; 1133 } 1134 z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4; 1135 z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7; 1136 } 1137 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1138 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1139 } else { 1140 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr); 1141 } 1142 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1143 PetscFunctionReturn(0); 1144 } 1145 #else 1146 #undef __FUNCT__ 1147 #define __FUNCT__ "MatMult_SeqBAIJ_7" 1148 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz) 1149 { 1150 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1151 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 1152 const PetscScalar *x,*xb; 1153 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray; 1154 const MatScalar *v; 1155 PetscErrorCode ierr; 1156 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL; 1157 PetscBool usecprow=a->compressedrow.use; 1158 1159 PetscFunctionBegin; 1160 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1161 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1162 1163 idx = a->j; 1164 v = a->a; 1165 if (usecprow) { 1166 mbs = a->compressedrow.nrows; 1167 ii = a->compressedrow.i; 1168 ridx = a->compressedrow.rindex; 1169 } else { 1170 mbs = a->mbs; 1171 ii = a->i; 1172 z = zarray; 1173 } 1174 1175 for (i=0; i<mbs; i++) { 1176 n = ii[1] - ii[0]; 1177 ii++; 1178 sum1 = 0.0; 1179 sum2 = 0.0; 1180 sum3 = 0.0; 1181 sum4 = 0.0; 1182 sum5 = 0.0; 1183 sum6 = 0.0; 1184 sum7 = 0.0; 1185 1186 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1187 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1188 for (j=0; j<n; j++) { 1189 xb = x + 7*(*idx++); 1190 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1191 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1192 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1193 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1194 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1195 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1196 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1197 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1198 v += 49; 1199 } 1200 if (usecprow) z = zarray + 7*ridx[i]; 1201 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1202 if (!usecprow) z += 7; 1203 } 1204 1205 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1206 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1207 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr); 1208 PetscFunctionReturn(0); 1209 } 1210 #endif 1211 1212 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */ 1213 /* Default MatMult for block size 15 */ 1214 1215 #undef __FUNCT__ 1216 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1" 1217 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz) 1218 { 1219 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1220 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1221 const PetscScalar *x,*xb; 1222 PetscScalar *zarray,xv; 1223 const MatScalar *v; 1224 PetscErrorCode ierr; 1225 const PetscInt *ii,*ij=a->j,*idx; 1226 PetscInt mbs,i,j,k,n,*ridx=NULL; 1227 PetscBool usecprow=a->compressedrow.use; 1228 1229 PetscFunctionBegin; 1230 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1231 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1232 1233 v = a->a; 1234 if (usecprow) { 1235 mbs = a->compressedrow.nrows; 1236 ii = a->compressedrow.i; 1237 ridx = a->compressedrow.rindex; 1238 } else { 1239 mbs = a->mbs; 1240 ii = a->i; 1241 z = zarray; 1242 } 1243 1244 for (i=0; i<mbs; i++) { 1245 n = ii[i+1] - ii[i]; 1246 idx = ij + ii[i]; 1247 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1248 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1249 1250 for (j=0; j<n; j++) { 1251 xb = x + 15*(idx[j]); 1252 1253 for (k=0; k<15; k++) { 1254 xv = xb[k]; 1255 sum1 += v[0]*xv; 1256 sum2 += v[1]*xv; 1257 sum3 += v[2]*xv; 1258 sum4 += v[3]*xv; 1259 sum5 += v[4]*xv; 1260 sum6 += v[5]*xv; 1261 sum7 += v[6]*xv; 1262 sum8 += v[7]*xv; 1263 sum9 += v[8]*xv; 1264 sum10 += v[9]*xv; 1265 sum11 += v[10]*xv; 1266 sum12 += v[11]*xv; 1267 sum13 += v[12]*xv; 1268 sum14 += v[13]*xv; 1269 sum15 += v[14]*xv; 1270 v += 15; 1271 } 1272 } 1273 if (usecprow) z = zarray + 15*ridx[i]; 1274 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1275 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1276 1277 if (!usecprow) z += 15; 1278 } 1279 1280 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1281 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1282 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1283 PetscFunctionReturn(0); 1284 } 1285 1286 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */ 1287 #undef __FUNCT__ 1288 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2" 1289 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz) 1290 { 1291 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1292 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1293 const PetscScalar *x,*xb; 1294 PetscScalar x1,x2,x3,x4,*zarray; 1295 const MatScalar *v; 1296 PetscErrorCode ierr; 1297 const PetscInt *ii,*ij=a->j,*idx; 1298 PetscInt mbs,i,j,n,*ridx=NULL; 1299 PetscBool usecprow=a->compressedrow.use; 1300 1301 PetscFunctionBegin; 1302 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1303 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1304 1305 v = a->a; 1306 if (usecprow) { 1307 mbs = a->compressedrow.nrows; 1308 ii = a->compressedrow.i; 1309 ridx = a->compressedrow.rindex; 1310 } else { 1311 mbs = a->mbs; 1312 ii = a->i; 1313 z = zarray; 1314 } 1315 1316 for (i=0; i<mbs; i++) { 1317 n = ii[i+1] - ii[i]; 1318 idx = ij + ii[i]; 1319 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1320 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1321 1322 for (j=0; j<n; j++) { 1323 xb = x + 15*(idx[j]); 1324 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1325 1326 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1327 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1328 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1329 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1330 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1331 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1332 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1333 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1334 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1335 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1336 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1337 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1338 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1339 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1340 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1341 1342 v += 60; 1343 1344 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7]; 1345 1346 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1347 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1348 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1349 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1350 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1351 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1352 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1353 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1354 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1355 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1356 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1357 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1358 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1359 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1360 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1361 v += 60; 1362 1363 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; 1364 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4; 1365 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4; 1366 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4; 1367 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4; 1368 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4; 1369 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4; 1370 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4; 1371 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4; 1372 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4; 1373 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4; 1374 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4; 1375 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4; 1376 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4; 1377 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4; 1378 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4; 1379 v += 60; 1380 1381 x1 = xb[12]; x2 = xb[13]; x3 = xb[14]; 1382 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3; 1383 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3; 1384 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3; 1385 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3; 1386 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3; 1387 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3; 1388 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3; 1389 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3; 1390 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3; 1391 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3; 1392 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3; 1393 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3; 1394 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3; 1395 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3; 1396 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3; 1397 v += 45; 1398 } 1399 if (usecprow) z = zarray + 15*ridx[i]; 1400 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1401 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1402 1403 if (!usecprow) z += 15; 1404 } 1405 1406 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1407 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1408 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1409 PetscFunctionReturn(0); 1410 } 1411 1412 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */ 1413 #undef __FUNCT__ 1414 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3" 1415 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz) 1416 { 1417 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1418 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1419 const PetscScalar *x,*xb; 1420 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray; 1421 const MatScalar *v; 1422 PetscErrorCode ierr; 1423 const PetscInt *ii,*ij=a->j,*idx; 1424 PetscInt mbs,i,j,n,*ridx=NULL; 1425 PetscBool usecprow=a->compressedrow.use; 1426 1427 PetscFunctionBegin; 1428 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1429 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1430 1431 v = a->a; 1432 if (usecprow) { 1433 mbs = a->compressedrow.nrows; 1434 ii = a->compressedrow.i; 1435 ridx = a->compressedrow.rindex; 1436 } else { 1437 mbs = a->mbs; 1438 ii = a->i; 1439 z = zarray; 1440 } 1441 1442 for (i=0; i<mbs; i++) { 1443 n = ii[i+1] - ii[i]; 1444 idx = ij + ii[i]; 1445 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1446 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1447 1448 for (j=0; j<n; j++) { 1449 xb = x + 15*(idx[j]); 1450 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1451 x8 = xb[7]; 1452 1453 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8; 1454 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8; 1455 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8; 1456 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8; 1457 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8; 1458 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8; 1459 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8; 1460 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8; 1461 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8; 1462 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8; 1463 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8; 1464 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8; 1465 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8; 1466 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8; 1467 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8; 1468 v += 120; 1469 1470 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14]; 1471 1472 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7; 1473 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7; 1474 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7; 1475 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7; 1476 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7; 1477 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7; 1478 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7; 1479 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7; 1480 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7; 1481 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7; 1482 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7; 1483 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7; 1484 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7; 1485 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7; 1486 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7; 1487 v += 105; 1488 } 1489 if (usecprow) z = zarray + 15*ridx[i]; 1490 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1491 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1492 1493 if (!usecprow) z += 15; 1494 } 1495 1496 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1497 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1498 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1499 PetscFunctionReturn(0); 1500 } 1501 1502 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */ 1503 1504 #undef __FUNCT__ 1505 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4" 1506 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz) 1507 { 1508 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1509 PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15; 1510 const PetscScalar *x,*xb; 1511 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray; 1512 const MatScalar *v; 1513 PetscErrorCode ierr; 1514 const PetscInt *ii,*ij=a->j,*idx; 1515 PetscInt mbs,i,j,n,*ridx=NULL; 1516 PetscBool usecprow=a->compressedrow.use; 1517 1518 PetscFunctionBegin; 1519 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1520 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1521 1522 v = a->a; 1523 if (usecprow) { 1524 mbs = a->compressedrow.nrows; 1525 ii = a->compressedrow.i; 1526 ridx = a->compressedrow.rindex; 1527 } else { 1528 mbs = a->mbs; 1529 ii = a->i; 1530 z = zarray; 1531 } 1532 1533 for (i=0; i<mbs; i++) { 1534 n = ii[i+1] - ii[i]; 1535 idx = ij + ii[i]; 1536 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0; 1537 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0; 1538 1539 for (j=0; j<n; j++) { 1540 xb = x + 15*(idx[j]); 1541 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 1542 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14]; 1543 1544 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 1545 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 1546 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 1547 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 1548 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 1549 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 1550 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 1551 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 1552 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 1553 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 1554 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 1555 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 1556 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 1557 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 1558 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 1559 v += 225; 1560 } 1561 if (usecprow) z = zarray + 15*ridx[i]; 1562 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 1563 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15; 1564 1565 if (!usecprow) z += 15; 1566 } 1567 1568 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1569 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1570 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr); 1571 PetscFunctionReturn(0); 1572 } 1573 1574 1575 /* 1576 This will not work with MatScalar == float because it calls the BLAS 1577 */ 1578 #undef __FUNCT__ 1579 #define __FUNCT__ "MatMult_SeqBAIJ_N" 1580 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz) 1581 { 1582 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1583 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 1584 MatScalar *v; 1585 PetscErrorCode ierr; 1586 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 1587 PetscInt ncols,k,*ridx=NULL; 1588 PetscBool usecprow=a->compressedrow.use; 1589 1590 PetscFunctionBegin; 1591 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1592 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1593 1594 idx = a->j; 1595 v = a->a; 1596 if (usecprow) { 1597 mbs = a->compressedrow.nrows; 1598 ii = a->compressedrow.i; 1599 ridx = a->compressedrow.rindex; 1600 } else { 1601 mbs = a->mbs; 1602 ii = a->i; 1603 z = zarray; 1604 } 1605 1606 if (!a->mult_work) { 1607 k = PetscMax(A->rmap->n,A->cmap->n); 1608 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 1609 } 1610 work = a->mult_work; 1611 for (i=0; i<mbs; i++) { 1612 n = ii[1] - ii[0]; ii++; 1613 ncols = n*bs; 1614 workt = work; 1615 for (j=0; j<n; j++) { 1616 xb = x + bs*(*idx++); 1617 for (k=0; k<bs; k++) workt[k] = xb[k]; 1618 workt += bs; 1619 } 1620 if (usecprow) z = zarray + bs*ridx[i]; 1621 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z); 1622 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */ 1623 v += n*bs2; 1624 if (!usecprow) z += bs; 1625 } 1626 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1627 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1628 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr); 1629 PetscFunctionReturn(0); 1630 } 1631 1632 #undef __FUNCT__ 1633 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1" 1634 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz) 1635 { 1636 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1637 const PetscScalar *x; 1638 PetscScalar *y,*z,sum; 1639 const MatScalar *v; 1640 PetscErrorCode ierr; 1641 PetscInt mbs=a->mbs,i,n,*ridx=NULL; 1642 const PetscInt *idx,*ii; 1643 PetscBool usecprow=a->compressedrow.use; 1644 1645 PetscFunctionBegin; 1646 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 1647 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 1648 if (zz != yy) { 1649 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 1650 } else { 1651 z = y; 1652 } 1653 1654 idx = a->j; 1655 v = a->a; 1656 if (usecprow) { 1657 if (zz != yy) { 1658 ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1659 } 1660 mbs = a->compressedrow.nrows; 1661 ii = a->compressedrow.i; 1662 ridx = a->compressedrow.rindex; 1663 } else { 1664 ii = a->i; 1665 } 1666 1667 for (i=0; i<mbs; i++) { 1668 n = ii[1] - ii[0]; 1669 ii++; 1670 if (!usecprow) { 1671 sum = y[i]; 1672 } else { 1673 sum = y[ridx[i]]; 1674 } 1675 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1676 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1677 PetscSparseDensePlusDot(sum,x,v,idx,n); 1678 v += n; 1679 idx += n; 1680 if (usecprow) { 1681 z[ridx[i]] = sum; 1682 } else { 1683 z[i] = sum; 1684 } 1685 } 1686 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 1687 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 1688 if (zz != yy) { 1689 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 1690 } 1691 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1692 PetscFunctionReturn(0); 1693 } 1694 1695 #undef __FUNCT__ 1696 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2" 1697 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz) 1698 { 1699 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1700 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2; 1701 PetscScalar x1,x2,*yarray,*zarray; 1702 MatScalar *v; 1703 PetscErrorCode ierr; 1704 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1705 PetscBool usecprow=a->compressedrow.use; 1706 1707 PetscFunctionBegin; 1708 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1709 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1710 if (zz != yy) { 1711 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1712 } else { 1713 zarray = yarray; 1714 } 1715 1716 idx = a->j; 1717 v = a->a; 1718 if (usecprow) { 1719 if (zz != yy) { 1720 ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1721 } 1722 mbs = a->compressedrow.nrows; 1723 ii = a->compressedrow.i; 1724 ridx = a->compressedrow.rindex; 1725 if (zz != yy) { 1726 ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1727 } 1728 } else { 1729 ii = a->i; 1730 y = yarray; 1731 z = zarray; 1732 } 1733 1734 for (i=0; i<mbs; i++) { 1735 n = ii[1] - ii[0]; ii++; 1736 if (usecprow) { 1737 z = zarray + 2*ridx[i]; 1738 y = yarray + 2*ridx[i]; 1739 } 1740 sum1 = y[0]; sum2 = y[1]; 1741 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1742 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1743 for (j=0; j<n; j++) { 1744 xb = x + 2*(*idx++); 1745 x1 = xb[0]; 1746 x2 = xb[1]; 1747 1748 sum1 += v[0]*x1 + v[2]*x2; 1749 sum2 += v[1]*x1 + v[3]*x2; 1750 v += 4; 1751 } 1752 z[0] = sum1; z[1] = sum2; 1753 if (!usecprow) { 1754 z += 2; y += 2; 1755 } 1756 } 1757 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1758 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1759 if (zz != yy) { 1760 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1761 } 1762 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 1763 PetscFunctionReturn(0); 1764 } 1765 1766 #undef __FUNCT__ 1767 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3" 1768 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz) 1769 { 1770 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1771 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray; 1772 MatScalar *v; 1773 PetscErrorCode ierr; 1774 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1775 PetscBool usecprow=a->compressedrow.use; 1776 1777 PetscFunctionBegin; 1778 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1779 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1780 if (zz != yy) { 1781 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1782 } else { 1783 zarray = yarray; 1784 } 1785 1786 idx = a->j; 1787 v = a->a; 1788 if (usecprow) { 1789 if (zz != yy) { 1790 ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1791 } 1792 mbs = a->compressedrow.nrows; 1793 ii = a->compressedrow.i; 1794 ridx = a->compressedrow.rindex; 1795 } else { 1796 ii = a->i; 1797 y = yarray; 1798 z = zarray; 1799 } 1800 1801 for (i=0; i<mbs; i++) { 1802 n = ii[1] - ii[0]; ii++; 1803 if (usecprow) { 1804 z = zarray + 3*ridx[i]; 1805 y = yarray + 3*ridx[i]; 1806 } 1807 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; 1808 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1809 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1810 for (j=0; j<n; j++) { 1811 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 1812 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3; 1813 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3; 1814 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3; 1815 v += 9; 1816 } 1817 z[0] = sum1; z[1] = sum2; z[2] = sum3; 1818 if (!usecprow) { 1819 z += 3; y += 3; 1820 } 1821 } 1822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1823 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1824 if (zz != yy) { 1825 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1826 } 1827 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr); 1828 PetscFunctionReturn(0); 1829 } 1830 1831 #undef __FUNCT__ 1832 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4" 1833 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz) 1834 { 1835 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1836 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray; 1837 MatScalar *v; 1838 PetscErrorCode ierr; 1839 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1840 PetscBool usecprow=a->compressedrow.use; 1841 1842 PetscFunctionBegin; 1843 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1844 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1845 if (zz != yy) { 1846 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1847 } else { 1848 zarray = yarray; 1849 } 1850 1851 idx = a->j; 1852 v = a->a; 1853 if (usecprow) { 1854 if (zz != yy) { 1855 ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1856 } 1857 mbs = a->compressedrow.nrows; 1858 ii = a->compressedrow.i; 1859 ridx = a->compressedrow.rindex; 1860 } else { 1861 ii = a->i; 1862 y = yarray; 1863 z = zarray; 1864 } 1865 1866 for (i=0; i<mbs; i++) { 1867 n = ii[1] - ii[0]; ii++; 1868 if (usecprow) { 1869 z = zarray + 4*ridx[i]; 1870 y = yarray + 4*ridx[i]; 1871 } 1872 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; 1873 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1874 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1875 for (j=0; j<n; j++) { 1876 xb = x + 4*(*idx++); 1877 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 1878 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1879 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1880 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1881 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1882 v += 16; 1883 } 1884 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; 1885 if (!usecprow) { 1886 z += 4; y += 4; 1887 } 1888 } 1889 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1890 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1891 if (zz != yy) { 1892 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1893 } 1894 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr); 1895 PetscFunctionReturn(0); 1896 } 1897 1898 #undef __FUNCT__ 1899 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5" 1900 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz) 1901 { 1902 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1903 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5; 1904 PetscScalar *yarray,*zarray; 1905 MatScalar *v; 1906 PetscErrorCode ierr; 1907 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1908 PetscBool usecprow=a->compressedrow.use; 1909 1910 PetscFunctionBegin; 1911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1912 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1913 if (zz != yy) { 1914 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1915 } else { 1916 zarray = yarray; 1917 } 1918 1919 idx = a->j; 1920 v = a->a; 1921 if (usecprow) { 1922 if (zz != yy) { 1923 ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1924 } 1925 mbs = a->compressedrow.nrows; 1926 ii = a->compressedrow.i; 1927 ridx = a->compressedrow.rindex; 1928 } else { 1929 ii = a->i; 1930 y = yarray; 1931 z = zarray; 1932 } 1933 1934 for (i=0; i<mbs; i++) { 1935 n = ii[1] - ii[0]; ii++; 1936 if (usecprow) { 1937 z = zarray + 5*ridx[i]; 1938 y = yarray + 5*ridx[i]; 1939 } 1940 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; 1941 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1942 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1943 for (j=0; j<n; j++) { 1944 xb = x + 5*(*idx++); 1945 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; 1946 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1947 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1948 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1949 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1950 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1951 v += 25; 1952 } 1953 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; 1954 if (!usecprow) { 1955 z += 5; y += 5; 1956 } 1957 } 1958 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1959 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 1960 if (zz != yy) { 1961 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 1962 } 1963 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr); 1964 PetscFunctionReturn(0); 1965 } 1966 #undef __FUNCT__ 1967 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6" 1968 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz) 1969 { 1970 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1971 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6; 1972 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray; 1973 MatScalar *v; 1974 PetscErrorCode ierr; 1975 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 1976 PetscBool usecprow=a->compressedrow.use; 1977 1978 PetscFunctionBegin; 1979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1980 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 1981 if (zz != yy) { 1982 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 1983 } else { 1984 zarray = yarray; 1985 } 1986 1987 idx = a->j; 1988 v = a->a; 1989 if (usecprow) { 1990 if (zz != yy) { 1991 ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1992 } 1993 mbs = a->compressedrow.nrows; 1994 ii = a->compressedrow.i; 1995 ridx = a->compressedrow.rindex; 1996 } else { 1997 ii = a->i; 1998 y = yarray; 1999 z = zarray; 2000 } 2001 2002 for (i=0; i<mbs; i++) { 2003 n = ii[1] - ii[0]; ii++; 2004 if (usecprow) { 2005 z = zarray + 6*ridx[i]; 2006 y = yarray + 6*ridx[i]; 2007 } 2008 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; 2009 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2010 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2011 for (j=0; j<n; j++) { 2012 xb = x + 6*(*idx++); 2013 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; 2014 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2015 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2016 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2017 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2018 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2019 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2020 v += 36; 2021 } 2022 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; 2023 if (!usecprow) { 2024 z += 6; y += 6; 2025 } 2026 } 2027 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2028 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 2029 if (zz != yy) { 2030 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2031 } 2032 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr); 2033 PetscFunctionReturn(0); 2034 } 2035 2036 #undef __FUNCT__ 2037 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7" 2038 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz) 2039 { 2040 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2041 PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7; 2042 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray; 2043 MatScalar *v; 2044 PetscErrorCode ierr; 2045 PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL; 2046 PetscBool usecprow=a->compressedrow.use; 2047 2048 PetscFunctionBegin; 2049 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2050 ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr); 2051 if (zz != yy) { 2052 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2053 } else { 2054 zarray = yarray; 2055 } 2056 2057 idx = a->j; 2058 v = a->a; 2059 if (usecprow) { 2060 if (zz != yy) { 2061 ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2062 } 2063 mbs = a->compressedrow.nrows; 2064 ii = a->compressedrow.i; 2065 ridx = a->compressedrow.rindex; 2066 } else { 2067 ii = a->i; 2068 y = yarray; 2069 z = zarray; 2070 } 2071 2072 for (i=0; i<mbs; i++) { 2073 n = ii[1] - ii[0]; ii++; 2074 if (usecprow) { 2075 z = zarray + 7*ridx[i]; 2076 y = yarray + 7*ridx[i]; 2077 } 2078 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6]; 2079 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 2080 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 2081 for (j=0; j<n; j++) { 2082 xb = x + 7*(*idx++); 2083 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6]; 2084 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2085 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2086 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2087 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2088 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2089 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2090 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2091 v += 49; 2092 } 2093 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7; 2094 if (!usecprow) { 2095 z += 7; y += 7; 2096 } 2097 } 2098 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2099 ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr); 2100 if (zz != yy) { 2101 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2102 } 2103 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr); 2104 PetscFunctionReturn(0); 2105 } 2106 2107 #undef __FUNCT__ 2108 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N" 2109 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz) 2110 { 2111 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2112 PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray; 2113 MatScalar *v; 2114 PetscErrorCode ierr; 2115 PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2; 2116 PetscInt ncols,k,*ridx=NULL; 2117 PetscBool usecprow=a->compressedrow.use; 2118 2119 PetscFunctionBegin; 2120 ierr = VecCopy(yy,zz);CHKERRQ(ierr); 2121 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2122 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr); 2123 2124 idx = a->j; 2125 v = a->a; 2126 if (usecprow) { 2127 mbs = a->compressedrow.nrows; 2128 ii = a->compressedrow.i; 2129 ridx = a->compressedrow.rindex; 2130 } else { 2131 mbs = a->mbs; 2132 ii = a->i; 2133 z = zarray; 2134 } 2135 2136 if (!a->mult_work) { 2137 k = PetscMax(A->rmap->n,A->cmap->n); 2138 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 2139 } 2140 work = a->mult_work; 2141 for (i=0; i<mbs; i++) { 2142 n = ii[1] - ii[0]; ii++; 2143 ncols = n*bs; 2144 workt = work; 2145 for (j=0; j<n; j++) { 2146 xb = x + bs*(*idx++); 2147 for (k=0; k<bs; k++) workt[k] = xb[k]; 2148 workt += bs; 2149 } 2150 if (usecprow) z = zarray + bs*ridx[i]; 2151 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z); 2152 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */ 2153 v += n*bs2; 2154 if (!usecprow) z += bs; 2155 } 2156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2157 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr); 2158 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr); 2159 PetscFunctionReturn(0); 2160 } 2161 2162 #undef __FUNCT__ 2163 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ" 2164 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2165 { 2166 PetscScalar zero = 0.0; 2167 PetscErrorCode ierr; 2168 2169 PetscFunctionBegin; 2170 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2171 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2172 PetscFunctionReturn(0); 2173 } 2174 2175 #undef __FUNCT__ 2176 #define __FUNCT__ "MatMultTranspose_SeqBAIJ" 2177 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz) 2178 { 2179 PetscScalar zero = 0.0; 2180 PetscErrorCode ierr; 2181 2182 PetscFunctionBegin; 2183 ierr = VecSet(zz,zero);CHKERRQ(ierr); 2184 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr); 2185 PetscFunctionReturn(0); 2186 } 2187 2188 #undef __FUNCT__ 2189 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ" 2190 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2191 { 2192 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2193 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 2194 MatScalar *v; 2195 PetscErrorCode ierr; 2196 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 2197 Mat_CompressedRow cprow = a->compressedrow; 2198 PetscBool usecprow=cprow.use; 2199 2200 PetscFunctionBegin; 2201 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2202 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2203 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2204 2205 idx = a->j; 2206 v = a->a; 2207 if (usecprow) { 2208 mbs = cprow.nrows; 2209 ii = cprow.i; 2210 ridx = cprow.rindex; 2211 } else { 2212 mbs=a->mbs; 2213 ii = a->i; 2214 xb = x; 2215 } 2216 2217 switch (bs) { 2218 case 1: 2219 for (i=0; i<mbs; i++) { 2220 if (usecprow) xb = x + ridx[i]; 2221 x1 = xb[0]; 2222 ib = idx + ii[0]; 2223 n = ii[1] - ii[0]; ii++; 2224 for (j=0; j<n; j++) { 2225 rval = ib[j]; 2226 z[rval] += PetscConj(*v) * x1; 2227 v++; 2228 } 2229 if (!usecprow) xb++; 2230 } 2231 break; 2232 case 2: 2233 for (i=0; i<mbs; i++) { 2234 if (usecprow) xb = x + 2*ridx[i]; 2235 x1 = xb[0]; x2 = xb[1]; 2236 ib = idx + ii[0]; 2237 n = ii[1] - ii[0]; ii++; 2238 for (j=0; j<n; j++) { 2239 rval = ib[j]*2; 2240 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2; 2241 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2; 2242 v += 4; 2243 } 2244 if (!usecprow) xb += 2; 2245 } 2246 break; 2247 case 3: 2248 for (i=0; i<mbs; i++) { 2249 if (usecprow) xb = x + 3*ridx[i]; 2250 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2251 ib = idx + ii[0]; 2252 n = ii[1] - ii[0]; ii++; 2253 for (j=0; j<n; j++) { 2254 rval = ib[j]*3; 2255 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3; 2256 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3; 2257 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3; 2258 v += 9; 2259 } 2260 if (!usecprow) xb += 3; 2261 } 2262 break; 2263 case 4: 2264 for (i=0; i<mbs; i++) { 2265 if (usecprow) xb = x + 4*ridx[i]; 2266 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2267 ib = idx + ii[0]; 2268 n = ii[1] - ii[0]; ii++; 2269 for (j=0; j<n; j++) { 2270 rval = ib[j]*4; 2271 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4; 2272 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4; 2273 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4; 2274 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4; 2275 v += 16; 2276 } 2277 if (!usecprow) xb += 4; 2278 } 2279 break; 2280 case 5: 2281 for (i=0; i<mbs; i++) { 2282 if (usecprow) xb = x + 5*ridx[i]; 2283 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2284 x4 = xb[3]; x5 = xb[4]; 2285 ib = idx + ii[0]; 2286 n = ii[1] - ii[0]; ii++; 2287 for (j=0; j<n; j++) { 2288 rval = ib[j]*5; 2289 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5; 2290 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5; 2291 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5; 2292 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5; 2293 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5; 2294 v += 25; 2295 } 2296 if (!usecprow) xb += 5; 2297 } 2298 break; 2299 default: { /* block sizes larger than 5 by 5 are handled by BLAS */ 2300 PetscInt ncols,k; 2301 PetscScalar *work,*workt,*xtmp; 2302 2303 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet"); 2304 if (!a->mult_work) { 2305 k = PetscMax(A->rmap->n,A->cmap->n); 2306 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 2307 } 2308 work = a->mult_work; 2309 xtmp = x; 2310 for (i=0; i<mbs; i++) { 2311 n = ii[1] - ii[0]; ii++; 2312 ncols = n*bs; 2313 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2314 if (usecprow) xtmp = x + bs*ridx[i]; 2315 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2316 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2317 v += n*bs2; 2318 if (!usecprow) xtmp += bs; 2319 workt = work; 2320 for (j=0; j<n; j++) { 2321 zb = z + bs*(*idx++); 2322 for (k=0; k<bs; k++) zb[k] += workt[k] ; 2323 workt += bs; 2324 } 2325 } 2326 } 2327 } 2328 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2329 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2330 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2331 PetscFunctionReturn(0); 2332 } 2333 2334 #undef __FUNCT__ 2335 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ" 2336 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz) 2337 { 2338 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2339 PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5; 2340 MatScalar *v; 2341 PetscErrorCode ierr; 2342 PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL; 2343 Mat_CompressedRow cprow = a->compressedrow; 2344 PetscBool usecprow=cprow.use; 2345 2346 PetscFunctionBegin; 2347 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); } 2348 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2349 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 2350 2351 idx = a->j; 2352 v = a->a; 2353 if (usecprow) { 2354 mbs = cprow.nrows; 2355 ii = cprow.i; 2356 ridx = cprow.rindex; 2357 } else { 2358 mbs=a->mbs; 2359 ii = a->i; 2360 xb = x; 2361 } 2362 2363 switch (bs) { 2364 case 1: 2365 for (i=0; i<mbs; i++) { 2366 if (usecprow) xb = x + ridx[i]; 2367 x1 = xb[0]; 2368 ib = idx + ii[0]; 2369 n = ii[1] - ii[0]; ii++; 2370 for (j=0; j<n; j++) { 2371 rval = ib[j]; 2372 z[rval] += *v * x1; 2373 v++; 2374 } 2375 if (!usecprow) xb++; 2376 } 2377 break; 2378 case 2: 2379 for (i=0; i<mbs; i++) { 2380 if (usecprow) xb = x + 2*ridx[i]; 2381 x1 = xb[0]; x2 = xb[1]; 2382 ib = idx + ii[0]; 2383 n = ii[1] - ii[0]; ii++; 2384 for (j=0; j<n; j++) { 2385 rval = ib[j]*2; 2386 z[rval++] += v[0]*x1 + v[1]*x2; 2387 z[rval++] += v[2]*x1 + v[3]*x2; 2388 v += 4; 2389 } 2390 if (!usecprow) xb += 2; 2391 } 2392 break; 2393 case 3: 2394 for (i=0; i<mbs; i++) { 2395 if (usecprow) xb = x + 3*ridx[i]; 2396 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2397 ib = idx + ii[0]; 2398 n = ii[1] - ii[0]; ii++; 2399 for (j=0; j<n; j++) { 2400 rval = ib[j]*3; 2401 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3; 2402 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3; 2403 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3; 2404 v += 9; 2405 } 2406 if (!usecprow) xb += 3; 2407 } 2408 break; 2409 case 4: 2410 for (i=0; i<mbs; i++) { 2411 if (usecprow) xb = x + 4*ridx[i]; 2412 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; 2413 ib = idx + ii[0]; 2414 n = ii[1] - ii[0]; ii++; 2415 for (j=0; j<n; j++) { 2416 rval = ib[j]*4; 2417 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 2418 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 2419 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 2420 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 2421 v += 16; 2422 } 2423 if (!usecprow) xb += 4; 2424 } 2425 break; 2426 case 5: 2427 for (i=0; i<mbs; i++) { 2428 if (usecprow) xb = x + 5*ridx[i]; 2429 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; 2430 x4 = xb[3]; x5 = xb[4]; 2431 ib = idx + ii[0]; 2432 n = ii[1] - ii[0]; ii++; 2433 for (j=0; j<n; j++) { 2434 rval = ib[j]*5; 2435 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 2436 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 2437 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 2438 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 2439 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 2440 v += 25; 2441 } 2442 if (!usecprow) xb += 5; 2443 } 2444 break; 2445 default: { /* block sizes larger then 5 by 5 are handled by BLAS */ 2446 PetscInt ncols,k; 2447 PetscScalar *work,*workt,*xtmp; 2448 2449 if (!a->mult_work) { 2450 k = PetscMax(A->rmap->n,A->cmap->n); 2451 ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr); 2452 } 2453 work = a->mult_work; 2454 xtmp = x; 2455 for (i=0; i<mbs; i++) { 2456 n = ii[1] - ii[0]; ii++; 2457 ncols = n*bs; 2458 ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr); 2459 if (usecprow) xtmp = x + bs*ridx[i]; 2460 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work); 2461 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */ 2462 v += n*bs2; 2463 if (!usecprow) xtmp += bs; 2464 workt = work; 2465 for (j=0; j<n; j++) { 2466 zb = z + bs*(*idx++); 2467 for (k=0; k<bs; k++) zb[k] += workt[k]; 2468 workt += bs; 2469 } 2470 } 2471 } 2472 } 2473 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2474 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 2475 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr); 2476 PetscFunctionReturn(0); 2477 } 2478 2479 #undef __FUNCT__ 2480 #define __FUNCT__ "MatScale_SeqBAIJ" 2481 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha) 2482 { 2483 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data; 2484 PetscInt totalnz = a->bs2*a->nz; 2485 PetscScalar oalpha = alpha; 2486 PetscErrorCode ierr; 2487 PetscBLASInt one = 1,tnz; 2488 2489 PetscFunctionBegin; 2490 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr); 2491 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one)); 2492 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr); 2493 PetscFunctionReturn(0); 2494 } 2495 2496 #undef __FUNCT__ 2497 #define __FUNCT__ "MatNorm_SeqBAIJ" 2498 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm) 2499 { 2500 PetscErrorCode ierr; 2501 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2502 MatScalar *v = a->a; 2503 PetscReal sum = 0.0; 2504 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1; 2505 2506 PetscFunctionBegin; 2507 if (type == NORM_FROBENIUS) { 2508 for (i=0; i< bs2*nz; i++) { 2509 sum += PetscRealPart(PetscConj(*v)*(*v)); v++; 2510 } 2511 *norm = PetscSqrtReal(sum); 2512 } else if (type == NORM_1) { /* maximum column sum */ 2513 PetscReal *tmp; 2514 PetscInt *bcol = a->j; 2515 ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr); 2516 ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr); 2517 for (i=0; i<nz; i++) { 2518 for (j=0; j<bs; j++) { 2519 k1 = bs*(*bcol) + j; /* column index */ 2520 for (k=0; k<bs; k++) { 2521 tmp[k1] += PetscAbsScalar(*v); v++; 2522 } 2523 } 2524 bcol++; 2525 } 2526 *norm = 0.0; 2527 for (j=0; j<A->cmap->n; j++) { 2528 if (tmp[j] > *norm) *norm = tmp[j]; 2529 } 2530 ierr = PetscFree(tmp);CHKERRQ(ierr); 2531 } else if (type == NORM_INFINITY) { /* maximum row sum */ 2532 *norm = 0.0; 2533 for (k=0; k<bs; k++) { 2534 for (j=0; j<a->mbs; j++) { 2535 v = a->a + bs2*a->i[j] + k; 2536 sum = 0.0; 2537 for (i=0; i<a->i[j+1]-a->i[j]; i++) { 2538 for (k1=0; k1<bs; k1++) { 2539 sum += PetscAbsScalar(*v); 2540 v += bs; 2541 } 2542 } 2543 if (sum > *norm) *norm = sum; 2544 } 2545 } 2546 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet"); 2547 PetscFunctionReturn(0); 2548 } 2549 2550 2551 #undef __FUNCT__ 2552 #define __FUNCT__ "MatEqual_SeqBAIJ" 2553 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg) 2554 { 2555 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 2556 PetscErrorCode ierr; 2557 2558 PetscFunctionBegin; 2559 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */ 2560 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) { 2561 *flg = PETSC_FALSE; 2562 PetscFunctionReturn(0); 2563 } 2564 2565 /* if the a->i are the same */ 2566 ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2567 if (!*flg) PetscFunctionReturn(0); 2568 2569 /* if a->j are the same */ 2570 ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr); 2571 if (!*flg) PetscFunctionReturn(0); 2572 2573 /* if a->a are the same */ 2574 ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr); 2575 PetscFunctionReturn(0); 2576 2577 } 2578 2579 #undef __FUNCT__ 2580 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ" 2581 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v) 2582 { 2583 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2584 PetscErrorCode ierr; 2585 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2; 2586 PetscScalar *x,zero = 0.0; 2587 MatScalar *aa,*aa_j; 2588 2589 PetscFunctionBegin; 2590 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 2591 bs = A->rmap->bs; 2592 aa = a->a; 2593 ai = a->i; 2594 aj = a->j; 2595 ambs = a->mbs; 2596 bs2 = a->bs2; 2597 2598 ierr = VecSet(v,zero);CHKERRQ(ierr); 2599 ierr = VecGetArray(v,&x);CHKERRQ(ierr); 2600 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr); 2601 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 2602 for (i=0; i<ambs; i++) { 2603 for (j=ai[i]; j<ai[i+1]; j++) { 2604 if (aj[j] == i) { 2605 row = i*bs; 2606 aa_j = aa+j*bs2; 2607 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k]; 2608 break; 2609 } 2610 } 2611 } 2612 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr); 2613 PetscFunctionReturn(0); 2614 } 2615 2616 #undef __FUNCT__ 2617 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ" 2618 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr) 2619 { 2620 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2621 const PetscScalar *l,*r,*li,*ri; 2622 PetscScalar x; 2623 MatScalar *aa, *v; 2624 PetscErrorCode ierr; 2625 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai; 2626 const PetscInt *ai,*aj; 2627 2628 PetscFunctionBegin; 2629 ai = a->i; 2630 aj = a->j; 2631 aa = a->a; 2632 m = A->rmap->n; 2633 n = A->cmap->n; 2634 bs = A->rmap->bs; 2635 mbs = a->mbs; 2636 bs2 = a->bs2; 2637 if (ll) { 2638 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr); 2639 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr); 2640 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length"); 2641 for (i=0; i<mbs; i++) { /* for each block row */ 2642 M = ai[i+1] - ai[i]; 2643 li = l + i*bs; 2644 v = aa + bs2*ai[i]; 2645 for (j=0; j<M; j++) { /* for each block */ 2646 for (k=0; k<bs2; k++) { 2647 (*v++) *= li[k%bs]; 2648 } 2649 } 2650 } 2651 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr); 2652 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2653 } 2654 2655 if (rr) { 2656 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr); 2657 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr); 2658 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length"); 2659 for (i=0; i<mbs; i++) { /* for each block row */ 2660 iai = ai[i]; 2661 M = ai[i+1] - iai; 2662 v = aa + bs2*iai; 2663 for (j=0; j<M; j++) { /* for each block */ 2664 ri = r + bs*aj[iai+j]; 2665 for (k=0; k<bs; k++) { 2666 x = ri[k]; 2667 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x; 2668 v += bs; 2669 } 2670 } 2671 } 2672 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr); 2673 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2674 } 2675 PetscFunctionReturn(0); 2676 } 2677 2678 2679 #undef __FUNCT__ 2680 #define __FUNCT__ "MatGetInfo_SeqBAIJ" 2681 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info) 2682 { 2683 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2684 2685 PetscFunctionBegin; 2686 info->block_size = a->bs2; 2687 info->nz_allocated = a->bs2*a->maxnz; 2688 info->nz_used = a->bs2*a->nz; 2689 info->nz_unneeded = (double)(info->nz_allocated - info->nz_used); 2690 info->assemblies = A->num_ass; 2691 info->mallocs = A->info.mallocs; 2692 info->memory = ((PetscObject)A)->mem; 2693 if (A->factortype) { 2694 info->fill_ratio_given = A->info.fill_ratio_given; 2695 info->fill_ratio_needed = A->info.fill_ratio_needed; 2696 info->factor_mallocs = A->info.factor_mallocs; 2697 } else { 2698 info->fill_ratio_given = 0; 2699 info->fill_ratio_needed = 0; 2700 info->factor_mallocs = 0; 2701 } 2702 PetscFunctionReturn(0); 2703 } 2704 2705 2706 #if defined(PETSC_THREADCOMM_ACTIVE) 2707 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A) 2708 { 2709 PetscErrorCode ierr; 2710 PetscInt *trstarts=A->rmap->trstarts; 2711 PetscInt n,start,end; 2712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2713 2714 start = trstarts[thread_id]; 2715 end = trstarts[thread_id+1]; 2716 n = a->i[end] - a->i[start]; 2717 ierr = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(PetscScalar));CHKERRQ(ierr); 2718 return 0; 2719 } 2720 2721 #undef __FUNCT__ 2722 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2723 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2724 { 2725 PetscErrorCode ierr; 2726 2727 2728 PetscFunctionBegin; 2729 ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr); 2730 PetscFunctionReturn(0); 2731 } 2732 #else 2733 #undef __FUNCT__ 2734 #define __FUNCT__ "MatZeroEntries_SeqBAIJ" 2735 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A) 2736 { 2737 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2738 PetscErrorCode ierr; 2739 2740 PetscFunctionBegin; 2741 ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr); 2742 PetscFunctionReturn(0); 2743 } 2744 #endif 2745