xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision dcca6d9d80ebd869fe6029bd05a3aa9faafef49e)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc((m+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag,sorted;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
93   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
98   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
99 
100   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
101   ssmap = smap;
102   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
103   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
104   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110     for (k=kstart; k<kend; k++) {
111       if (ssmap[aj[k]]) lens[i]++;
112     }
113   }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ*)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
120     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
122     C    = *B;
123   } else {
124     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
125     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
126     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
127     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ*)(C->data);
130   for (i=0; i<nrows; i++) {
131     row      = irow[i];
132     kstart   = ai[row];
133     kend     = kstart + a->ilen[row];
134     mat_i    = c->i[i];
135     mat_j    = c->j + mat_i;
136     mat_a    = c->a + mat_i*bs2;
137     mat_ilen = c->ilen + i;
138     for (k=kstart; k<kend; k++) {
139       if ((tcol=ssmap[a->j[k]])) {
140         *mat_j++ = tcol - 1;
141         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
142         mat_a   += bs2;
143         (*mat_ilen)++;
144       }
145     }
146   }
147 
148   /* Free work space */
149   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
150   ierr = PetscFree(smap);CHKERRQ(ierr);
151   ierr = PetscFree(lens);CHKERRQ(ierr);
152   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
153   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154 
155   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
156   *B   = C;
157   PetscFunctionReturn(0);
158 }
159 
160 #undef __FUNCT__
161 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
162 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
163 {
164   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
165   IS             is1,is2;
166   PetscErrorCode ierr;
167   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
168   const PetscInt *irow,*icol;
169 
170   PetscFunctionBegin;
171   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
172   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
173   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
175 
176   /* Verify if the indices corespond to each element in a block
177    and form the IS with compressed IS */
178   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
179   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
180   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
181   count = 0;
182   for (i=0; i<a->mbs; i++) {
183     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
184     if (vary[i]==bs) iary[count++] = i;
185   }
186   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
187 
188   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
189   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
190   count = 0;
191   for (i=0; i<a->mbs; i++) {
192     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
193     if (vary[i]==bs) iary[count++] = i;
194   }
195   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
196   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
197   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
199 
200   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
201   ierr = ISDestroy(&is1);CHKERRQ(ierr);
202   ierr = ISDestroy(&is2);CHKERRQ(ierr);
203   PetscFunctionReturn(0);
204 }
205 
206 #undef __FUNCT__
207 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
208 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
209 {
210   PetscErrorCode ierr;
211   PetscInt       i;
212 
213   PetscFunctionBegin;
214   if (scall == MAT_INITIAL_MATRIX) {
215     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
216   }
217 
218   for (i=0; i<n; i++) {
219     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
220   }
221   PetscFunctionReturn(0);
222 }
223 
224 
225 /* -------------------------------------------------------*/
226 /* Should check that shapes of vectors and matrices match */
227 /* -------------------------------------------------------*/
228 
229 #if defined(PETSC_THREADCOMM_ACTIVE)
230 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
231 {
232   PetscErrorCode    ierr;
233   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
234   PetscScalar       *z;
235   const PetscScalar *x;
236   const MatScalar   *aa;
237   PetscInt          *trstarts=A->rmap->trstarts;
238   PetscInt          n,start,end,i;
239   const PetscInt    *aj,*ai;
240   PetscScalar       sum;
241 
242   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
243   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
244   start = trstarts[thread_id];
245   end   = trstarts[thread_id+1];
246   ai    = a->i;
247   for (i=start; i<end; i++) {
248     n   = ai[i+1] - ai[i];
249     aj  = a->j + ai[i];
250     aa  = a->a + ai[i];
251     sum = 0.0;
252     PetscSparseDensePlusDot(sum,x,aa,aj,n);
253     z[i] = sum;
254   }
255   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
256   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
257   return 0;
258 }
259 
260 #undef __FUNCT__
261 #define __FUNCT__ "MatMult_SeqBAIJ_1"
262 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
263 {
264   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
265   PetscScalar       *z,sum;
266   const PetscScalar *x;
267   const MatScalar   *v;
268   PetscErrorCode    ierr;
269   PetscInt          mbs,i,n;
270   const PetscInt    *idx,*ii,*ridx=NULL;
271   PetscBool         usecprow=a->compressedrow.use;
272 
273   PetscFunctionBegin;
274 
275   if (usecprow) {
276     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
277     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
278     mbs  = a->compressedrow.nrows;
279     ii   = a->compressedrow.i;
280     ridx = a->compressedrow.rindex;
281     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
282     for (i=0; i<mbs; i++) {
283       n   = ii[i+1] - ii[i];
284       v   = a->a + ii[i];
285       idx = a->j + ii[i];
286       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
287       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
288       sum = 0.0;
289       PetscSparseDensePlusDot(sum,x,v,idx,n);
290       z[ridx[i]] = sum;
291     }
292     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
293     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
294   } else {
295     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
296   }
297   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
298   PetscFunctionReturn(0);
299 }
300 #else
301 #undef __FUNCT__
302 #define __FUNCT__ "MatMult_SeqBAIJ_1"
303 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
304 {
305   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
306   PetscScalar       *z,sum;
307   const PetscScalar *x;
308   const MatScalar   *v;
309   PetscErrorCode    ierr;
310   PetscInt          mbs,i,n;
311   const PetscInt    *idx,*ii,*ridx=NULL;
312   PetscBool         usecprow=a->compressedrow.use;
313 
314   PetscFunctionBegin;
315   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
316   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
317 
318   if (usecprow) {
319     mbs  = a->compressedrow.nrows;
320     ii   = a->compressedrow.i;
321     ridx = a->compressedrow.rindex;
322     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
323   } else {
324     mbs = a->mbs;
325     ii  = a->i;
326   }
327 
328   for (i=0; i<mbs; i++) {
329     n   = ii[1] - ii[0];
330     v   = a->a + ii[0];
331     idx = a->j + ii[0];
332     ii++;
333     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
334     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
335     sum = 0.0;
336     PetscSparseDensePlusDot(sum,x,v,idx,n);
337     if (usecprow) {
338       z[ridx[i]] = sum;
339     } else {
340       z[i]        = sum;
341     }
342   }
343   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
344   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
345   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
346   PetscFunctionReturn(0);
347 }
348 #endif
349 
350 #if defined(PETSC_THREADCOMM_ACTIVE)
351 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
352 {
353   PetscErrorCode    ierr;
354   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
355   PetscScalar       *z,x1,x2,sum1,sum2;
356   const PetscScalar *x,*xb;
357   const MatScalar   *aa;
358   PetscInt          *trstarts=A->rmap->trstarts;
359   PetscInt          n,start,end,i,j;
360   const PetscInt    *aj,*ai;
361 
362   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
363   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
364   start  = trstarts[thread_id] / 2;
365   end    = trstarts[thread_id+1] / 2;
366   ai     = a->i;
367   for (i=start; i<end; i++) {
368     n    = ai[i+1] - ai[i];
369     aj   = a->j + ai[i];
370     aa   = a->a + ai[i]*4;
371     sum1 = 0.0; sum2 = 0.0;
372     for (j=0; j<n; j++) {
373       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
374       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
375       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
376     }
377     z[2*i] = sum1; z[2*i+1] = sum2;
378   }
379   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
380   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
381   return 0;
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatMult_SeqBAIJ_2"
386 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
387 {
388   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
389   PetscScalar       *z,x1,x2,sum1,sum2;
390   const PetscScalar *x,*xb;
391   const MatScalar   *v;
392   PetscErrorCode    ierr;
393   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
394   PetscBool         usecprow=a->compressedrow.use;
395 
396   PetscFunctionBegin;
397 
398   if (usecprow) {
399     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
400     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404     for (i=0; i<mbs; i++) {
405       n    = ii[i+1] - ii[i];
406       idx  = a->j + ii[i];
407       v    = a->a + ii[i]*4;
408       sum1 = 0.0; sum2 = 0.0;
409       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
410       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
411       for (j=0; j<n; j++) {
412 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
413 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
414 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
415       }
416       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
417     }
418     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
419     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
420   } else {
421     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
422   }
423   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
424   PetscFunctionReturn(0);
425 }
426 #else
427 #undef __FUNCT__
428 #define __FUNCT__ "MatMult_SeqBAIJ_2"
429 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
430 {
431   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
432   PetscScalar       *z = 0,sum1,sum2,*zarray;
433   const PetscScalar *x,*xb;
434   PetscScalar       x1,x2;
435   const MatScalar   *v;
436   PetscErrorCode    ierr;
437   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
438   PetscBool         usecprow=a->compressedrow.use;
439 
440   PetscFunctionBegin;
441   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
442   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
443 
444   idx = a->j;
445   v   = a->a;
446   if (usecprow) {
447     mbs  = a->compressedrow.nrows;
448     ii   = a->compressedrow.i;
449     ridx = a->compressedrow.rindex;
450   } else {
451     mbs = a->mbs;
452     ii  = a->i;
453     z   = zarray;
454   }
455 
456   for (i=0; i<mbs; i++) {
457     n           = ii[1] - ii[0]; ii++;
458     sum1        = 0.0; sum2 = 0.0;
459     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
460     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
461     for (j=0; j<n; j++) {
462       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
463       sum1 += v[0]*x1 + v[2]*x2;
464       sum2 += v[1]*x1 + v[3]*x2;
465       v    += 4;
466     }
467     if (usecprow) z = zarray + 2*ridx[i];
468     z[0] = sum1; z[1] = sum2;
469     if (!usecprow) z += 2;
470   }
471   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
472   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
473   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
474   PetscFunctionReturn(0);
475 }
476 #endif
477 
478 #if defined(PETSC_THREADCOMM_ACTIVE)
479 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
480 {
481   PetscErrorCode    ierr;
482   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
483   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
484   const PetscScalar *x,*xb;
485   const MatScalar   *aa;
486   PetscInt          *trstarts=A->rmap->trstarts;
487   PetscInt          n,start,end,i,j;
488   const PetscInt    *aj,*ai;
489 
490   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
491   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
492   start  = trstarts[thread_id] / 3;
493   end    = trstarts[thread_id+1] / 3;
494   ai     = a->i;
495   for (i=start; i<end; i++) {
496     n    = ai[i+1] - ai[i];
497     aj   = a->j + ai[i];
498     aa   = a->a + ai[i]*9;
499     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
500     for (j=0; j<n; j++) {
501       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
502       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
503       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
504       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
505     }
506     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
507   }
508   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510   return 0;
511 }
512 
513 #undef __FUNCT__
514 #define __FUNCT__ "MatMult_SeqBAIJ_3"
515 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
516 {
517   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
518   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
519   const PetscScalar *x,*xb;
520   const MatScalar   *v;
521   PetscErrorCode    ierr;
522   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
523   PetscBool         usecprow=a->compressedrow.use;
524 
525 
526 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527 #pragma disjoint(*v,*z,*xb)
528 #endif
529 
530   PetscFunctionBegin;
531 
532   if (usecprow) {
533     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
534     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
535     mbs  = a->compressedrow.nrows;
536     ii   = a->compressedrow.i;
537     ridx = a->compressedrow.rindex;
538     for (i=0; i<mbs; i++) {
539       n    = ii[i+1] - ii[i];
540       idx  = a->j + ii[i];
541       v    = a->a + ii[i]*9;
542       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
543       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545       for (j=0; j<n; j++) {
546 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
547 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
548 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
549 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
550       }
551       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
552     }
553     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
554     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
555   } else {
556     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
557   }
558   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
559   PetscFunctionReturn(0);
560 }
561 #else
562 #undef __FUNCT__
563 #define __FUNCT__ "MatMult_SeqBAIJ_3"
564 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
565 {
566   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
567   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
568   const PetscScalar *x,*xb;
569   const MatScalar   *v;
570   PetscErrorCode    ierr;
571   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
572   PetscBool         usecprow=a->compressedrow.use;
573 
574 
575 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
576 #pragma disjoint(*v,*z,*xb)
577 #endif
578 
579   PetscFunctionBegin;
580   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
581   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
582 
583   idx = a->j;
584   v   = a->a;
585   if (usecprow) {
586     mbs  = a->compressedrow.nrows;
587     ii   = a->compressedrow.i;
588     ridx = a->compressedrow.rindex;
589   } else {
590     mbs = a->mbs;
591     ii  = a->i;
592     z   = zarray;
593   }
594 
595   for (i=0; i<mbs; i++) {
596     n           = ii[1] - ii[0]; ii++;
597     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
598     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
599     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
600     for (j=0; j<n; j++) {
601       xb = x + 3*(*idx++);
602       x1 = xb[0];
603       x2 = xb[1];
604       x3 = xb[2];
605 
606       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
607       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
608       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
609       v    += 9;
610     }
611     if (usecprow) z = zarray + 3*ridx[i];
612     z[0] = sum1; z[1] = sum2; z[2] = sum3;
613     if (!usecprow) z += 3;
614   }
615   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
616   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
617   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
618   PetscFunctionReturn(0);
619 }
620 #endif
621 
622 #if defined(PETSC_THREADCOMM_ACTIVE)
623 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
624 {
625   PetscErrorCode    ierr;
626   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
627   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
628   const PetscScalar *x,*xb;
629   const MatScalar   *aa;
630   PetscInt          *trstarts=A->rmap->trstarts;
631   PetscInt          n,start,end,i,j;
632   const PetscInt    *aj,*ai;
633 
634   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
635   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
636   start  = trstarts[thread_id] / 4;
637   end    = trstarts[thread_id+1] / 4;
638   ai     = a->i;
639   for (i=start; i<end; i++) {
640     n    = ai[i+1] - ai[i];
641     aj   = a->j + ai[i];
642     aa   = a->a + ai[i]*16;
643     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
644     for (j=0; j<n; j++) {
645       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
646       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
647       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
648       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
649       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
650     }
651     z[4*i]   = sum1; z[4*i+1] = sum2;
652     z[4*i+2] = sum3; z[4*i+3] = sum4;
653   }
654   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
655   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
656   return 0;
657 }
658 
659 #undef __FUNCT__
660 #define __FUNCT__ "MatMult_SeqBAIJ_4"
661 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
662 {
663   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
664   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
665   const PetscScalar *x,*xb;
666   const MatScalar   *v;
667   PetscErrorCode    ierr;
668   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
669   PetscBool         usecprow=a->compressedrow.use;
670 
671   PetscFunctionBegin;
672 
673   if (usecprow) {
674     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
675     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
676     mbs  = a->compressedrow.nrows;
677     ii   = a->compressedrow.i;
678     ridx = a->compressedrow.rindex;
679     for (i=0; i<mbs; i++) {
680       n = ii[i+1] - ii[1];
681       idx  = a->j + ii[i];
682       v    = a->a + ii[i]*16;
683       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
684       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
685       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
686       for (j=0; j<n; j++) {
687 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
688 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
689 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
690 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
691 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
692       }
693       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
694       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
695     }
696     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
697     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
698   } else {
699     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
700   }
701   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 #else
705 #undef __FUNCT__
706 #define __FUNCT__ "MatMult_SeqBAIJ_4"
707 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
708 {
709   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
710   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
711   const PetscScalar *x,*xb;
712   const MatScalar   *v;
713   PetscErrorCode    ierr;
714   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
715   PetscBool         usecprow=a->compressedrow.use;
716 
717   PetscFunctionBegin;
718   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
719   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
720 
721   idx = a->j;
722   v   = a->a;
723   if (usecprow) {
724     mbs  = a->compressedrow.nrows;
725     ii   = a->compressedrow.i;
726     ridx = a->compressedrow.rindex;
727   } else {
728     mbs = a->mbs;
729     ii  = a->i;
730     z   = zarray;
731   }
732 
733   for (i=0; i<mbs; i++) {
734     n = ii[1] - ii[0];
735     ii++;
736     sum1 = 0.0;
737     sum2 = 0.0;
738     sum3 = 0.0;
739     sum4 = 0.0;
740 
741     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
742     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
743     for (j=0; j<n; j++) {
744       xb    = x + 4*(*idx++);
745       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
746       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
747       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
748       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
749       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
750       v    += 16;
751     }
752     if (usecprow) z = zarray + 4*ridx[i];
753     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
754     if (!usecprow) z += 4;
755   }
756   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
757   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
758   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
759   PetscFunctionReturn(0);
760 }
761 #endif
762 
763 #if defined(PETSC_THREADCOMM_ACTIVE)
764 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
765 {
766   PetscErrorCode    ierr;
767   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
768   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
769   const PetscScalar *x,*xb;
770   const MatScalar   *aa;
771   PetscInt          *trstarts=A->rmap->trstarts;
772   PetscInt          n,start,end,i,j;
773   const PetscInt    *aj,*ai;
774 
775   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
776   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
777   start  = trstarts[thread_id] / 5;
778   end    = trstarts[thread_id+1] / 5;
779   ai     = a->i;
780   for (i=start; i<end; i++) {
781     n    = ai[i+1] - ai[i];
782     aj   = a->j + ai[i];
783     aa   = a->a + ai[i]*25;
784     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
785     for (j=0; j<n; j++) {
786       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
787       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
788       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
789       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
790       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
791       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
792     }
793     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
794     z[5*i+3] = sum4; z[5*i+4] = sum5;
795   }
796   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
797   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
798   return 0;
799 }
800 
801 #undef __FUNCT__
802 #define __FUNCT__ "MatMult_SeqBAIJ_5"
803 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
804 {
805   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
806   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
807   const PetscScalar *xb,*x;
808   const MatScalar   *v;
809   PetscErrorCode    ierr;
810   const PetscInt    *idx,*ii,*ridx=NULL;
811   PetscInt          mbs,i,j,n;
812   PetscBool         usecprow=a->compressedrow.use;
813 
814   PetscFunctionBegin;
815 
816   if (usecprow) {
817     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
818     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
819     mbs  = a->compressedrow.nrows;
820     ii   = a->compressedrow.i;
821     ridx = a->compressedrow.rindex;
822     for (i=0; i<mbs; i++) {
823       n    = ii[i+1] - ii[i];
824       idx  = a->j + ii[i];
825       v    = a->a + ii[i]*25;
826       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
827       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
828       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
829       for (j=0; j<n; j++) {
830 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
831 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
832 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
833 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
834 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
835 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
836       }
837       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
838       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
839     }
840     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
841     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
842   } else {
843     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
844   }
845   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
846   PetscFunctionReturn(0);
847 }
848 #else
849 #undef __FUNCT__
850 #define __FUNCT__ "MatMult_SeqBAIJ_5"
851 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
852 {
853   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
854   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
855   const PetscScalar *xb,*x;
856   const MatScalar   *v;
857   PetscErrorCode    ierr;
858   const PetscInt    *idx,*ii,*ridx=NULL;
859   PetscInt          mbs,i,j,n;
860   PetscBool         usecprow=a->compressedrow.use;
861 
862   PetscFunctionBegin;
863   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
864   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
865 
866   idx = a->j;
867   v   = a->a;
868   if (usecprow) {
869     mbs  = a->compressedrow.nrows;
870     ii   = a->compressedrow.i;
871     ridx = a->compressedrow.rindex;
872   } else {
873     mbs = a->mbs;
874     ii  = a->i;
875     z   = zarray;
876   }
877 
878   for (i=0; i<mbs; i++) {
879     n           = ii[1] - ii[0]; ii++;
880     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
881     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
882     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
883     for (j=0; j<n; j++) {
884       xb    = x + 5*(*idx++);
885       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
886       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
887       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
888       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
889       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
890       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
891       v    += 25;
892     }
893     if (usecprow) z = zarray + 5*ridx[i];
894     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
895     if (!usecprow) z += 5;
896   }
897   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
898   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
899   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
900   PetscFunctionReturn(0);
901 }
902 #endif
903 
904 
905 #if defined(PETSC_THREADCOMM_ACTIVE)
906 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
907 {
908   PetscErrorCode    ierr;
909   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
910   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
911   const PetscScalar *x,*xb;
912   const MatScalar   *aa;
913   PetscInt          *trstarts=A->rmap->trstarts;
914   PetscInt          n,start,end,i,j;
915   const PetscInt    *aj,*ai;
916 
917   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
918   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
919   start  = trstarts[thread_id] / 6;
920   end    = trstarts[thread_id+1] / 6;
921   ai     = a->i;
922   for (i=start; i<end; i++) {
923     n    = ai[i+1] - ai[i];
924     aj   = a->j + ai[i];
925     aa   = a->a + ai[i]*36;
926     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
927     for (j=0; j<n; j++) {
928       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
929       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
930       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
931       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
932       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
933       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
934       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
935     }
936     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
937     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
938   }
939   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
940   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
941   return 0;
942 }
943 
944 #undef __FUNCT__
945 #define __FUNCT__ "MatMult_SeqBAIJ_6"
946 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
947 {
948   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
949   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
950   const PetscScalar *x,*xb;
951   const MatScalar   *v;
952   PetscErrorCode    ierr;
953   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
954   PetscBool         usecprow=a->compressedrow.use;
955 
956   PetscFunctionBegin;
957 
958   if (usecprow) {
959     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
960     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
961     mbs  = a->compressedrow.nrows;
962     ii   = a->compressedrow.i;
963     ridx = a->compressedrow.rindex;
964     for (i=0; i<mbs; i++) {
965       n  = ii[i+1] - ii[i];
966       idx  = a->j + ii[i];
967       v    = a->a + ii[i]*36;
968       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
969       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
970       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971       for (j=0; j<n; j++) {
972 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
973 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
974 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
975 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
976 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
977 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
978 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
979       }
980       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
981       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
982     }
983     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
984     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
985   } else {
986     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
987   }
988   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
989   PetscFunctionReturn(0);
990 }
991 #else
992 #undef __FUNCT__
993 #define __FUNCT__ "MatMult_SeqBAIJ_6"
994 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
995 {
996   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
997   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
998   const PetscScalar *x,*xb;
999   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
1000   const MatScalar   *v;
1001   PetscErrorCode    ierr;
1002   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1003   PetscBool         usecprow=a->compressedrow.use;
1004 
1005   PetscFunctionBegin;
1006   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1007   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1008 
1009   idx = a->j;
1010   v   = a->a;
1011   if (usecprow) {
1012     mbs  = a->compressedrow.nrows;
1013     ii   = a->compressedrow.i;
1014     ridx = a->compressedrow.rindex;
1015   } else {
1016     mbs = a->mbs;
1017     ii  = a->i;
1018     z   = zarray;
1019   }
1020 
1021   for (i=0; i<mbs; i++) {
1022     n  = ii[1] - ii[0];
1023     ii++;
1024     sum1 = 0.0;
1025     sum2 = 0.0;
1026     sum3 = 0.0;
1027     sum4 = 0.0;
1028     sum5 = 0.0;
1029     sum6 = 0.0;
1030 
1031     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1032     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1033     for (j=0; j<n; j++) {
1034       xb    = x + 6*(*idx++);
1035       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1036       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1037       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1038       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1039       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1040       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1041       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1042       v    += 36;
1043     }
1044     if (usecprow) z = zarray + 6*ridx[i];
1045     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1046     if (!usecprow) z += 6;
1047   }
1048 
1049   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1050   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1051   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1052   PetscFunctionReturn(0);
1053 }
1054 #endif
1055 
1056 #if defined(PETSC_THREADCOMM_ACTIVE)
1057 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1058 {
1059   PetscErrorCode    ierr;
1060   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1061   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1062   const PetscScalar *x,*xb;
1063   const MatScalar   *aa;
1064   PetscInt          *trstarts=A->rmap->trstarts;
1065   PetscInt          n,start,end,i,j;
1066   const PetscInt    *aj,*ai;
1067 
1068   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1069   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1070   start  = trstarts[thread_id] / 7;
1071   end    = trstarts[thread_id+1] / 7;
1072   ai     = a->i;
1073   for (i=start; i<end; i++) {
1074     n    = ai[i+1] - ai[i];
1075     aj   = a->j + ai[i];
1076     aa   = a->a + ai[i]*49;
1077     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1078     for (j=0; j<n; j++) {
1079       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1080       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1081       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1082       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1083       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1084       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1085       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1086       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1087     }
1088     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1089     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1090   }
1091   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1092   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1093   return 0;
1094 }
1095 
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1098 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1099 {
1100   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1101   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1102   const PetscScalar *x,*xb;
1103   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1104   const MatScalar   *v;
1105   PetscErrorCode    ierr;
1106   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1107   PetscBool         usecprow=a->compressedrow.use;
1108 
1109   PetscFunctionBegin;
1110 
1111   if (usecprow) {
1112     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1113     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1114     mbs  = a->compressedrow.nrows;
1115     ii   = a->compressedrow.i;
1116     ridx = a->compressedrow.rindex;
1117     for (i=0; i<mbs; i++) {
1118       n    = ii[i+1] - ii[i];
1119       idx  = a->j + ii[i];
1120       v    = a->a + ii[i]*49;
1121       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1122       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1123       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1124       for (j=0; j<n; j++) {
1125 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1126 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1127 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1128 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1129 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1130 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1131 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1132 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1133       }
1134       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1135       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1136     }
1137     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1138     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1139   } else {
1140     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1141   }
1142   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1143   PetscFunctionReturn(0);
1144 }
1145 #else
1146 #undef __FUNCT__
1147 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1148 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1149 {
1150   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1151   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1152   const PetscScalar *x,*xb;
1153   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1154   const MatScalar   *v;
1155   PetscErrorCode    ierr;
1156   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1157   PetscBool         usecprow=a->compressedrow.use;
1158 
1159   PetscFunctionBegin;
1160   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1161   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1162 
1163   idx = a->j;
1164   v   = a->a;
1165   if (usecprow) {
1166     mbs  = a->compressedrow.nrows;
1167     ii   = a->compressedrow.i;
1168     ridx = a->compressedrow.rindex;
1169   } else {
1170     mbs = a->mbs;
1171     ii  = a->i;
1172     z   = zarray;
1173   }
1174 
1175   for (i=0; i<mbs; i++) {
1176     n  = ii[1] - ii[0];
1177     ii++;
1178     sum1 = 0.0;
1179     sum2 = 0.0;
1180     sum3 = 0.0;
1181     sum4 = 0.0;
1182     sum5 = 0.0;
1183     sum6 = 0.0;
1184     sum7 = 0.0;
1185 
1186     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1187     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1188     for (j=0; j<n; j++) {
1189       xb    = x + 7*(*idx++);
1190       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1191       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1192       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1193       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1194       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1195       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1196       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1197       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1198       v    += 49;
1199     }
1200     if (usecprow) z = zarray + 7*ridx[i];
1201     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1202     if (!usecprow) z += 7;
1203   }
1204 
1205   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1206   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1207   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1208   PetscFunctionReturn(0);
1209 }
1210 #endif
1211 
1212 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1213 /* Default MatMult for block size 15 */
1214 
1215 #undef __FUNCT__
1216 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1217 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1218 {
1219   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1220   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1221   const PetscScalar *x,*xb;
1222   PetscScalar       *zarray,xv;
1223   const MatScalar   *v;
1224   PetscErrorCode    ierr;
1225   const PetscInt    *ii,*ij=a->j,*idx;
1226   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1227   PetscBool         usecprow=a->compressedrow.use;
1228 
1229   PetscFunctionBegin;
1230   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1231   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1232 
1233   v = a->a;
1234   if (usecprow) {
1235     mbs  = a->compressedrow.nrows;
1236     ii   = a->compressedrow.i;
1237     ridx = a->compressedrow.rindex;
1238   } else {
1239     mbs = a->mbs;
1240     ii  = a->i;
1241     z   = zarray;
1242   }
1243 
1244   for (i=0; i<mbs; i++) {
1245     n    = ii[i+1] - ii[i];
1246     idx  = ij + ii[i];
1247     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1248     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1249 
1250     for (j=0; j<n; j++) {
1251       xb = x + 15*(idx[j]);
1252 
1253       for (k=0; k<15; k++) {
1254         xv     =  xb[k];
1255         sum1  += v[0]*xv;
1256         sum2  += v[1]*xv;
1257         sum3  += v[2]*xv;
1258         sum4  += v[3]*xv;
1259         sum5  += v[4]*xv;
1260         sum6  += v[5]*xv;
1261         sum7  += v[6]*xv;
1262         sum8  += v[7]*xv;
1263         sum9  += v[8]*xv;
1264         sum10 += v[9]*xv;
1265         sum11 += v[10]*xv;
1266         sum12 += v[11]*xv;
1267         sum13 += v[12]*xv;
1268         sum14 += v[13]*xv;
1269         sum15 += v[14]*xv;
1270         v     += 15;
1271       }
1272     }
1273     if (usecprow) z = zarray + 15*ridx[i];
1274     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1275     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1276 
1277     if (!usecprow) z += 15;
1278   }
1279 
1280   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1281   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1282   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1283   PetscFunctionReturn(0);
1284 }
1285 
1286 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1287 #undef __FUNCT__
1288 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1289 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1290 {
1291   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1292   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1293   const PetscScalar *x,*xb;
1294   PetscScalar       x1,x2,x3,x4,*zarray;
1295   const MatScalar   *v;
1296   PetscErrorCode    ierr;
1297   const PetscInt    *ii,*ij=a->j,*idx;
1298   PetscInt          mbs,i,j,n,*ridx=NULL;
1299   PetscBool         usecprow=a->compressedrow.use;
1300 
1301   PetscFunctionBegin;
1302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1303   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1304 
1305   v = a->a;
1306   if (usecprow) {
1307     mbs  = a->compressedrow.nrows;
1308     ii   = a->compressedrow.i;
1309     ridx = a->compressedrow.rindex;
1310   } else {
1311     mbs = a->mbs;
1312     ii  = a->i;
1313     z   = zarray;
1314   }
1315 
1316   for (i=0; i<mbs; i++) {
1317     n    = ii[i+1] - ii[i];
1318     idx  = ij + ii[i];
1319     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1320     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1321 
1322     for (j=0; j<n; j++) {
1323       xb = x + 15*(idx[j]);
1324       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1325 
1326       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1327       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1328       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1329       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1330       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1331       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1332       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1333       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1334       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1335       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1336       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1337       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1338       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1339       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1340       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1341 
1342       v += 60;
1343 
1344       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1345 
1346       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1347       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1348       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1349       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1350       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1351       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1352       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1353       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1354       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1355       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1356       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1357       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1358       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1359       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1360       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1361       v     += 60;
1362 
1363       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1364       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1365       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1366       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1367       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1368       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1369       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1370       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1371       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1372       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1373       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1374       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1375       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1376       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1377       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1378       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1379       v     += 60;
1380 
1381       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1382       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1383       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1384       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1385       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1386       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1387       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1388       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1389       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1390       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1391       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1392       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1393       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1394       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1395       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1396       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1397       v     += 45;
1398     }
1399     if (usecprow) z = zarray + 15*ridx[i];
1400     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1401     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1402 
1403     if (!usecprow) z += 15;
1404   }
1405 
1406   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1407   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1408   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1409   PetscFunctionReturn(0);
1410 }
1411 
1412 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1413 #undef __FUNCT__
1414 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1415 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1416 {
1417   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1418   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1419   const PetscScalar *x,*xb;
1420   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1421   const MatScalar   *v;
1422   PetscErrorCode    ierr;
1423   const PetscInt    *ii,*ij=a->j,*idx;
1424   PetscInt          mbs,i,j,n,*ridx=NULL;
1425   PetscBool         usecprow=a->compressedrow.use;
1426 
1427   PetscFunctionBegin;
1428   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1429   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1430 
1431   v = a->a;
1432   if (usecprow) {
1433     mbs  = a->compressedrow.nrows;
1434     ii   = a->compressedrow.i;
1435     ridx = a->compressedrow.rindex;
1436   } else {
1437     mbs = a->mbs;
1438     ii  = a->i;
1439     z   = zarray;
1440   }
1441 
1442   for (i=0; i<mbs; i++) {
1443     n    = ii[i+1] - ii[i];
1444     idx  = ij + ii[i];
1445     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1446     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1447 
1448     for (j=0; j<n; j++) {
1449       xb = x + 15*(idx[j]);
1450       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1451       x8 = xb[7];
1452 
1453       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1454       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1455       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1456       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1457       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1458       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1459       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1460       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1461       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1462       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1463       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1464       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1465       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1466       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1467       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1468       v     += 120;
1469 
1470       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1471 
1472       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1473       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1474       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1475       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1476       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1477       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1478       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1479       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1480       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1481       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1482       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1483       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1484       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1485       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1486       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1487       v     += 105;
1488     }
1489     if (usecprow) z = zarray + 15*ridx[i];
1490     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1491     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1492 
1493     if (!usecprow) z += 15;
1494   }
1495 
1496   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1497   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1498   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1499   PetscFunctionReturn(0);
1500 }
1501 
1502 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1503 
1504 #undef __FUNCT__
1505 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1506 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1507 {
1508   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1509   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1510   const PetscScalar *x,*xb;
1511   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1512   const MatScalar   *v;
1513   PetscErrorCode    ierr;
1514   const PetscInt    *ii,*ij=a->j,*idx;
1515   PetscInt          mbs,i,j,n,*ridx=NULL;
1516   PetscBool         usecprow=a->compressedrow.use;
1517 
1518   PetscFunctionBegin;
1519   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1520   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1521 
1522   v = a->a;
1523   if (usecprow) {
1524     mbs  = a->compressedrow.nrows;
1525     ii   = a->compressedrow.i;
1526     ridx = a->compressedrow.rindex;
1527   } else {
1528     mbs = a->mbs;
1529     ii  = a->i;
1530     z   = zarray;
1531   }
1532 
1533   for (i=0; i<mbs; i++) {
1534     n    = ii[i+1] - ii[i];
1535     idx  = ij + ii[i];
1536     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1537     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1538 
1539     for (j=0; j<n; j++) {
1540       xb = x + 15*(idx[j]);
1541       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1542       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1543 
1544       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1545       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1546       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1547       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1548       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1549       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1550       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1551       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1552       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1553       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1554       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1555       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1556       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1557       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1558       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1559       v     += 225;
1560     }
1561     if (usecprow) z = zarray + 15*ridx[i];
1562     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1563     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1564 
1565     if (!usecprow) z += 15;
1566   }
1567 
1568   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1569   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1570   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 
1575 /*
1576     This will not work with MatScalar == float because it calls the BLAS
1577 */
1578 #undef __FUNCT__
1579 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1580 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1581 {
1582   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1583   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1584   MatScalar      *v;
1585   PetscErrorCode ierr;
1586   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1587   PetscInt       ncols,k,*ridx=NULL;
1588   PetscBool      usecprow=a->compressedrow.use;
1589 
1590   PetscFunctionBegin;
1591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1592   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1593 
1594   idx = a->j;
1595   v   = a->a;
1596   if (usecprow) {
1597     mbs  = a->compressedrow.nrows;
1598     ii   = a->compressedrow.i;
1599     ridx = a->compressedrow.rindex;
1600   } else {
1601     mbs = a->mbs;
1602     ii  = a->i;
1603     z   = zarray;
1604   }
1605 
1606   if (!a->mult_work) {
1607     k    = PetscMax(A->rmap->n,A->cmap->n);
1608     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1609   }
1610   work = a->mult_work;
1611   for (i=0; i<mbs; i++) {
1612     n           = ii[1] - ii[0]; ii++;
1613     ncols       = n*bs;
1614     workt       = work;
1615     for (j=0; j<n; j++) {
1616       xb = x + bs*(*idx++);
1617       for (k=0; k<bs; k++) workt[k] = xb[k];
1618       workt += bs;
1619     }
1620     if (usecprow) z = zarray + bs*ridx[i];
1621     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1622     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1623     v += n*bs2;
1624     if (!usecprow) z += bs;
1625   }
1626   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1627   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1628   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 #undef __FUNCT__
1633 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1634 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1635 {
1636   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1637   const PetscScalar *x;
1638   PetscScalar       *y,*z,sum;
1639   const MatScalar   *v;
1640   PetscErrorCode    ierr;
1641   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1642   const PetscInt    *idx,*ii;
1643   PetscBool         usecprow=a->compressedrow.use;
1644 
1645   PetscFunctionBegin;
1646   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1647   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1648   if (zz != yy) {
1649     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1650   } else {
1651     z = y;
1652   }
1653 
1654   idx = a->j;
1655   v   = a->a;
1656   if (usecprow) {
1657     if (zz != yy) {
1658       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1659     }
1660     mbs  = a->compressedrow.nrows;
1661     ii   = a->compressedrow.i;
1662     ridx = a->compressedrow.rindex;
1663   } else {
1664     ii = a->i;
1665   }
1666 
1667   for (i=0; i<mbs; i++) {
1668     n = ii[1] - ii[0];
1669     ii++;
1670     if (!usecprow) {
1671       sum         = y[i];
1672     } else {
1673       sum = y[ridx[i]];
1674     }
1675     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1676     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1677     PetscSparseDensePlusDot(sum,x,v,idx,n);
1678     v   += n;
1679     idx += n;
1680     if (usecprow) {
1681       z[ridx[i]] = sum;
1682     } else {
1683       z[i] = sum;
1684     }
1685   }
1686   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1687   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1688   if (zz != yy) {
1689     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1690   }
1691   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 #undef __FUNCT__
1696 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1697 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1698 {
1699   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1700   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1701   PetscScalar    x1,x2,*yarray,*zarray;
1702   MatScalar      *v;
1703   PetscErrorCode ierr;
1704   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1705   PetscBool      usecprow=a->compressedrow.use;
1706 
1707   PetscFunctionBegin;
1708   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1709   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1710   if (zz != yy) {
1711     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1712   } else {
1713     zarray = yarray;
1714   }
1715 
1716   idx = a->j;
1717   v   = a->a;
1718   if (usecprow) {
1719     if (zz != yy) {
1720       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1721     }
1722     mbs  = a->compressedrow.nrows;
1723     ii   = a->compressedrow.i;
1724     ridx = a->compressedrow.rindex;
1725     if (zz != yy) {
1726       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1727     }
1728   } else {
1729     ii = a->i;
1730     y  = yarray;
1731     z  = zarray;
1732   }
1733 
1734   for (i=0; i<mbs; i++) {
1735     n = ii[1] - ii[0]; ii++;
1736     if (usecprow) {
1737       z = zarray + 2*ridx[i];
1738       y = yarray + 2*ridx[i];
1739     }
1740     sum1 = y[0]; sum2 = y[1];
1741     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1742     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1743     for (j=0; j<n; j++) {
1744       xb = x + 2*(*idx++);
1745       x1 = xb[0];
1746       x2 = xb[1];
1747 
1748       sum1 += v[0]*x1 + v[2]*x2;
1749       sum2 += v[1]*x1 + v[3]*x2;
1750       v    += 4;
1751     }
1752     z[0] = sum1; z[1] = sum2;
1753     if (!usecprow) {
1754       z += 2; y += 2;
1755     }
1756   }
1757   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1758   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1759   if (zz != yy) {
1760     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1761   }
1762   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1763   PetscFunctionReturn(0);
1764 }
1765 
1766 #undef __FUNCT__
1767 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1768 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1769 {
1770   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1771   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1772   MatScalar      *v;
1773   PetscErrorCode ierr;
1774   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1775   PetscBool      usecprow=a->compressedrow.use;
1776 
1777   PetscFunctionBegin;
1778   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1779   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1780   if (zz != yy) {
1781     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1782   } else {
1783     zarray = yarray;
1784   }
1785 
1786   idx = a->j;
1787   v   = a->a;
1788   if (usecprow) {
1789     if (zz != yy) {
1790       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1791     }
1792     mbs  = a->compressedrow.nrows;
1793     ii   = a->compressedrow.i;
1794     ridx = a->compressedrow.rindex;
1795   } else {
1796     ii = a->i;
1797     y  = yarray;
1798     z  = zarray;
1799   }
1800 
1801   for (i=0; i<mbs; i++) {
1802     n = ii[1] - ii[0]; ii++;
1803     if (usecprow) {
1804       z = zarray + 3*ridx[i];
1805       y = yarray + 3*ridx[i];
1806     }
1807     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1808     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1809     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1810     for (j=0; j<n; j++) {
1811       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1812       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1813       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1814       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1815       v    += 9;
1816     }
1817     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1818     if (!usecprow) {
1819       z += 3; y += 3;
1820     }
1821   }
1822   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1823   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1824   if (zz != yy) {
1825     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1826   }
1827   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1828   PetscFunctionReturn(0);
1829 }
1830 
1831 #undef __FUNCT__
1832 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1833 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1834 {
1835   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1836   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1837   MatScalar      *v;
1838   PetscErrorCode ierr;
1839   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1840   PetscBool      usecprow=a->compressedrow.use;
1841 
1842   PetscFunctionBegin;
1843   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1844   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1845   if (zz != yy) {
1846     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1847   } else {
1848     zarray = yarray;
1849   }
1850 
1851   idx = a->j;
1852   v   = a->a;
1853   if (usecprow) {
1854     if (zz != yy) {
1855       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1856     }
1857     mbs  = a->compressedrow.nrows;
1858     ii   = a->compressedrow.i;
1859     ridx = a->compressedrow.rindex;
1860   } else {
1861     ii = a->i;
1862     y  = yarray;
1863     z  = zarray;
1864   }
1865 
1866   for (i=0; i<mbs; i++) {
1867     n = ii[1] - ii[0]; ii++;
1868     if (usecprow) {
1869       z = zarray + 4*ridx[i];
1870       y = yarray + 4*ridx[i];
1871     }
1872     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1873     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1874     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1875     for (j=0; j<n; j++) {
1876       xb    = x + 4*(*idx++);
1877       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1878       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1879       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1880       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1881       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1882       v    += 16;
1883     }
1884     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1885     if (!usecprow) {
1886       z += 4; y += 4;
1887     }
1888   }
1889   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1890   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1891   if (zz != yy) {
1892     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1893   }
1894   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1895   PetscFunctionReturn(0);
1896 }
1897 
1898 #undef __FUNCT__
1899 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1900 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1901 {
1902   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1903   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1904   PetscScalar    *yarray,*zarray;
1905   MatScalar      *v;
1906   PetscErrorCode ierr;
1907   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1908   PetscBool      usecprow=a->compressedrow.use;
1909 
1910   PetscFunctionBegin;
1911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1912   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1913   if (zz != yy) {
1914     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1915   } else {
1916     zarray = yarray;
1917   }
1918 
1919   idx = a->j;
1920   v   = a->a;
1921   if (usecprow) {
1922     if (zz != yy) {
1923       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1924     }
1925     mbs  = a->compressedrow.nrows;
1926     ii   = a->compressedrow.i;
1927     ridx = a->compressedrow.rindex;
1928   } else {
1929     ii = a->i;
1930     y  = yarray;
1931     z  = zarray;
1932   }
1933 
1934   for (i=0; i<mbs; i++) {
1935     n = ii[1] - ii[0]; ii++;
1936     if (usecprow) {
1937       z = zarray + 5*ridx[i];
1938       y = yarray + 5*ridx[i];
1939     }
1940     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1941     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1942     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1943     for (j=0; j<n; j++) {
1944       xb    = x + 5*(*idx++);
1945       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1946       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1947       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1948       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1949       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1950       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1951       v    += 25;
1952     }
1953     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1954     if (!usecprow) {
1955       z += 5; y += 5;
1956     }
1957   }
1958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1959   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1960   if (zz != yy) {
1961     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1962   }
1963   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1964   PetscFunctionReturn(0);
1965 }
1966 #undef __FUNCT__
1967 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1968 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1969 {
1970   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1971   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1972   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1973   MatScalar      *v;
1974   PetscErrorCode ierr;
1975   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1976   PetscBool      usecprow=a->compressedrow.use;
1977 
1978   PetscFunctionBegin;
1979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1980   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1981   if (zz != yy) {
1982     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1983   } else {
1984     zarray = yarray;
1985   }
1986 
1987   idx = a->j;
1988   v   = a->a;
1989   if (usecprow) {
1990     if (zz != yy) {
1991       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1992     }
1993     mbs  = a->compressedrow.nrows;
1994     ii   = a->compressedrow.i;
1995     ridx = a->compressedrow.rindex;
1996   } else {
1997     ii = a->i;
1998     y  = yarray;
1999     z  = zarray;
2000   }
2001 
2002   for (i=0; i<mbs; i++) {
2003     n = ii[1] - ii[0]; ii++;
2004     if (usecprow) {
2005       z = zarray + 6*ridx[i];
2006       y = yarray + 6*ridx[i];
2007     }
2008     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2009     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2010     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2011     for (j=0; j<n; j++) {
2012       xb    = x + 6*(*idx++);
2013       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2014       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2015       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2016       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2017       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2018       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2019       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2020       v    += 36;
2021     }
2022     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2023     if (!usecprow) {
2024       z += 6; y += 6;
2025     }
2026   }
2027   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2028   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
2029   if (zz != yy) {
2030     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2031   }
2032   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2033   PetscFunctionReturn(0);
2034 }
2035 
2036 #undef __FUNCT__
2037 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2038 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2039 {
2040   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2041   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2042   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2043   MatScalar      *v;
2044   PetscErrorCode ierr;
2045   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
2046   PetscBool      usecprow=a->compressedrow.use;
2047 
2048   PetscFunctionBegin;
2049   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2050   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
2051   if (zz != yy) {
2052     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2053   } else {
2054     zarray = yarray;
2055   }
2056 
2057   idx = a->j;
2058   v   = a->a;
2059   if (usecprow) {
2060     if (zz != yy) {
2061       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2062     }
2063     mbs  = a->compressedrow.nrows;
2064     ii   = a->compressedrow.i;
2065     ridx = a->compressedrow.rindex;
2066   } else {
2067     ii = a->i;
2068     y  = yarray;
2069     z  = zarray;
2070   }
2071 
2072   for (i=0; i<mbs; i++) {
2073     n = ii[1] - ii[0]; ii++;
2074     if (usecprow) {
2075       z = zarray + 7*ridx[i];
2076       y = yarray + 7*ridx[i];
2077     }
2078     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2079     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2080     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2081     for (j=0; j<n; j++) {
2082       xb    = x + 7*(*idx++);
2083       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2084       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2085       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2086       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2087       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2088       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2089       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2090       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2091       v    += 49;
2092     }
2093     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2094     if (!usecprow) {
2095       z += 7; y += 7;
2096     }
2097   }
2098   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2099   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
2100   if (zz != yy) {
2101     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2102   }
2103   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2104   PetscFunctionReturn(0);
2105 }
2106 
2107 #undef __FUNCT__
2108 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2109 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2110 {
2111   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2112   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
2113   MatScalar      *v;
2114   PetscErrorCode ierr;
2115   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
2116   PetscInt       ncols,k,*ridx=NULL;
2117   PetscBool      usecprow=a->compressedrow.use;
2118 
2119   PetscFunctionBegin;
2120   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2121   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2123 
2124   idx = a->j;
2125   v   = a->a;
2126   if (usecprow) {
2127     mbs  = a->compressedrow.nrows;
2128     ii   = a->compressedrow.i;
2129     ridx = a->compressedrow.rindex;
2130   } else {
2131     mbs = a->mbs;
2132     ii  = a->i;
2133     z   = zarray;
2134   }
2135 
2136   if (!a->mult_work) {
2137     k    = PetscMax(A->rmap->n,A->cmap->n);
2138     ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
2139   }
2140   work = a->mult_work;
2141   for (i=0; i<mbs; i++) {
2142     n     = ii[1] - ii[0]; ii++;
2143     ncols = n*bs;
2144     workt = work;
2145     for (j=0; j<n; j++) {
2146       xb = x + bs*(*idx++);
2147       for (k=0; k<bs; k++) workt[k] = xb[k];
2148       workt += bs;
2149     }
2150     if (usecprow) z = zarray + bs*ridx[i];
2151     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2152     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2153     v += n*bs2;
2154     if (!usecprow) z += bs;
2155   }
2156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2157   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2158   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 #undef __FUNCT__
2163 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2164 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2165 {
2166   PetscScalar    zero = 0.0;
2167   PetscErrorCode ierr;
2168 
2169   PetscFunctionBegin;
2170   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2171   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2172   PetscFunctionReturn(0);
2173 }
2174 
2175 #undef __FUNCT__
2176 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2177 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2178 {
2179   PetscScalar    zero = 0.0;
2180   PetscErrorCode ierr;
2181 
2182   PetscFunctionBegin;
2183   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2184   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2185   PetscFunctionReturn(0);
2186 }
2187 
2188 #undef __FUNCT__
2189 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2190 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2191 {
2192   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2193   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2194   MatScalar         *v;
2195   PetscErrorCode    ierr;
2196   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2197   Mat_CompressedRow cprow   = a->compressedrow;
2198   PetscBool         usecprow=cprow.use;
2199 
2200   PetscFunctionBegin;
2201   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2203   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2204 
2205   idx = a->j;
2206   v   = a->a;
2207   if (usecprow) {
2208     mbs  = cprow.nrows;
2209     ii   = cprow.i;
2210     ridx = cprow.rindex;
2211   } else {
2212     mbs=a->mbs;
2213     ii = a->i;
2214     xb = x;
2215   }
2216 
2217   switch (bs) {
2218   case 1:
2219     for (i=0; i<mbs; i++) {
2220       if (usecprow) xb = x + ridx[i];
2221       x1 = xb[0];
2222       ib = idx + ii[0];
2223       n  = ii[1] - ii[0]; ii++;
2224       for (j=0; j<n; j++) {
2225         rval     = ib[j];
2226         z[rval] += PetscConj(*v) * x1;
2227         v++;
2228       }
2229       if (!usecprow) xb++;
2230     }
2231     break;
2232   case 2:
2233     for (i=0; i<mbs; i++) {
2234       if (usecprow) xb = x + 2*ridx[i];
2235       x1 = xb[0]; x2 = xb[1];
2236       ib = idx + ii[0];
2237       n  = ii[1] - ii[0]; ii++;
2238       for (j=0; j<n; j++) {
2239         rval       = ib[j]*2;
2240         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2241         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2242         v         += 4;
2243       }
2244       if (!usecprow) xb += 2;
2245     }
2246     break;
2247   case 3:
2248     for (i=0; i<mbs; i++) {
2249       if (usecprow) xb = x + 3*ridx[i];
2250       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2251       ib = idx + ii[0];
2252       n  = ii[1] - ii[0]; ii++;
2253       for (j=0; j<n; j++) {
2254         rval       = ib[j]*3;
2255         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2256         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2257         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2258         v         += 9;
2259       }
2260       if (!usecprow) xb += 3;
2261     }
2262     break;
2263   case 4:
2264     for (i=0; i<mbs; i++) {
2265       if (usecprow) xb = x + 4*ridx[i];
2266       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2267       ib = idx + ii[0];
2268       n  = ii[1] - ii[0]; ii++;
2269       for (j=0; j<n; j++) {
2270         rval       = ib[j]*4;
2271         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2272         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2273         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2274         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2275         v         += 16;
2276       }
2277       if (!usecprow) xb += 4;
2278     }
2279     break;
2280   case 5:
2281     for (i=0; i<mbs; i++) {
2282       if (usecprow) xb = x + 5*ridx[i];
2283       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2284       x4 = xb[3]; x5 = xb[4];
2285       ib = idx + ii[0];
2286       n  = ii[1] - ii[0]; ii++;
2287       for (j=0; j<n; j++) {
2288         rval       = ib[j]*5;
2289         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2290         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2291         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2292         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2293         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2294         v         += 25;
2295       }
2296       if (!usecprow) xb += 5;
2297     }
2298     break;
2299   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2300     PetscInt    ncols,k;
2301     PetscScalar *work,*workt,*xtmp;
2302 
2303     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2304     if (!a->mult_work) {
2305       k    = PetscMax(A->rmap->n,A->cmap->n);
2306       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
2307     }
2308     work = a->mult_work;
2309     xtmp = x;
2310     for (i=0; i<mbs; i++) {
2311       n     = ii[1] - ii[0]; ii++;
2312       ncols = n*bs;
2313       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2314       if (usecprow) xtmp = x + bs*ridx[i];
2315       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2316       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2317       v += n*bs2;
2318       if (!usecprow) xtmp += bs;
2319       workt = work;
2320       for (j=0; j<n; j++) {
2321         zb = z + bs*(*idx++);
2322         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2323         workt += bs;
2324       }
2325     }
2326     }
2327   }
2328   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2329   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2330   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2331   PetscFunctionReturn(0);
2332 }
2333 
2334 #undef __FUNCT__
2335 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2336 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2337 {
2338   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2339   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2340   MatScalar         *v;
2341   PetscErrorCode    ierr;
2342   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2343   Mat_CompressedRow cprow   = a->compressedrow;
2344   PetscBool         usecprow=cprow.use;
2345 
2346   PetscFunctionBegin;
2347   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2348   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2349   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2350 
2351   idx = a->j;
2352   v   = a->a;
2353   if (usecprow) {
2354     mbs  = cprow.nrows;
2355     ii   = cprow.i;
2356     ridx = cprow.rindex;
2357   } else {
2358     mbs=a->mbs;
2359     ii = a->i;
2360     xb = x;
2361   }
2362 
2363   switch (bs) {
2364   case 1:
2365     for (i=0; i<mbs; i++) {
2366       if (usecprow) xb = x + ridx[i];
2367       x1 = xb[0];
2368       ib = idx + ii[0];
2369       n  = ii[1] - ii[0]; ii++;
2370       for (j=0; j<n; j++) {
2371         rval     = ib[j];
2372         z[rval] += *v * x1;
2373         v++;
2374       }
2375       if (!usecprow) xb++;
2376     }
2377     break;
2378   case 2:
2379     for (i=0; i<mbs; i++) {
2380       if (usecprow) xb = x + 2*ridx[i];
2381       x1 = xb[0]; x2 = xb[1];
2382       ib = idx + ii[0];
2383       n  = ii[1] - ii[0]; ii++;
2384       for (j=0; j<n; j++) {
2385         rval       = ib[j]*2;
2386         z[rval++] += v[0]*x1 + v[1]*x2;
2387         z[rval++] += v[2]*x1 + v[3]*x2;
2388         v         += 4;
2389       }
2390       if (!usecprow) xb += 2;
2391     }
2392     break;
2393   case 3:
2394     for (i=0; i<mbs; i++) {
2395       if (usecprow) xb = x + 3*ridx[i];
2396       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2397       ib = idx + ii[0];
2398       n  = ii[1] - ii[0]; ii++;
2399       for (j=0; j<n; j++) {
2400         rval       = ib[j]*3;
2401         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2402         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2403         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2404         v         += 9;
2405       }
2406       if (!usecprow) xb += 3;
2407     }
2408     break;
2409   case 4:
2410     for (i=0; i<mbs; i++) {
2411       if (usecprow) xb = x + 4*ridx[i];
2412       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2413       ib = idx + ii[0];
2414       n  = ii[1] - ii[0]; ii++;
2415       for (j=0; j<n; j++) {
2416         rval       = ib[j]*4;
2417         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2418         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2419         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2420         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2421         v         += 16;
2422       }
2423       if (!usecprow) xb += 4;
2424     }
2425     break;
2426   case 5:
2427     for (i=0; i<mbs; i++) {
2428       if (usecprow) xb = x + 5*ridx[i];
2429       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2430       x4 = xb[3]; x5 = xb[4];
2431       ib = idx + ii[0];
2432       n  = ii[1] - ii[0]; ii++;
2433       for (j=0; j<n; j++) {
2434         rval       = ib[j]*5;
2435         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2436         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2437         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2438         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2439         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2440         v         += 25;
2441       }
2442       if (!usecprow) xb += 5;
2443     }
2444     break;
2445   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2446     PetscInt    ncols,k;
2447     PetscScalar *work,*workt,*xtmp;
2448 
2449     if (!a->mult_work) {
2450       k    = PetscMax(A->rmap->n,A->cmap->n);
2451       ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
2452     }
2453     work = a->mult_work;
2454     xtmp = x;
2455     for (i=0; i<mbs; i++) {
2456       n     = ii[1] - ii[0]; ii++;
2457       ncols = n*bs;
2458       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2459       if (usecprow) xtmp = x + bs*ridx[i];
2460       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2461       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2462       v += n*bs2;
2463       if (!usecprow) xtmp += bs;
2464       workt = work;
2465       for (j=0; j<n; j++) {
2466         zb = z + bs*(*idx++);
2467         for (k=0; k<bs; k++) zb[k] += workt[k];
2468         workt += bs;
2469       }
2470     }
2471     }
2472   }
2473   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2474   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2475   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2476   PetscFunctionReturn(0);
2477 }
2478 
2479 #undef __FUNCT__
2480 #define __FUNCT__ "MatScale_SeqBAIJ"
2481 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2482 {
2483   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2484   PetscInt       totalnz = a->bs2*a->nz;
2485   PetscScalar    oalpha  = alpha;
2486   PetscErrorCode ierr;
2487   PetscBLASInt   one = 1,tnz;
2488 
2489   PetscFunctionBegin;
2490   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2491   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2492   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2493   PetscFunctionReturn(0);
2494 }
2495 
2496 #undef __FUNCT__
2497 #define __FUNCT__ "MatNorm_SeqBAIJ"
2498 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2499 {
2500   PetscErrorCode ierr;
2501   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2502   MatScalar      *v  = a->a;
2503   PetscReal      sum = 0.0;
2504   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2505 
2506   PetscFunctionBegin;
2507   if (type == NORM_FROBENIUS) {
2508     for (i=0; i< bs2*nz; i++) {
2509       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2510     }
2511     *norm = PetscSqrtReal(sum);
2512   } else if (type == NORM_1) { /* maximum column sum */
2513     PetscReal *tmp;
2514     PetscInt  *bcol = a->j;
2515     ierr = PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
2516     ierr = PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));CHKERRQ(ierr);
2517     for (i=0; i<nz; i++) {
2518       for (j=0; j<bs; j++) {
2519         k1 = bs*(*bcol) + j; /* column index */
2520         for (k=0; k<bs; k++) {
2521           tmp[k1] += PetscAbsScalar(*v); v++;
2522         }
2523       }
2524       bcol++;
2525     }
2526     *norm = 0.0;
2527     for (j=0; j<A->cmap->n; j++) {
2528       if (tmp[j] > *norm) *norm = tmp[j];
2529     }
2530     ierr = PetscFree(tmp);CHKERRQ(ierr);
2531   } else if (type == NORM_INFINITY) { /* maximum row sum */
2532     *norm = 0.0;
2533     for (k=0; k<bs; k++) {
2534       for (j=0; j<a->mbs; j++) {
2535         v   = a->a + bs2*a->i[j] + k;
2536         sum = 0.0;
2537         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2538           for (k1=0; k1<bs; k1++) {
2539             sum += PetscAbsScalar(*v);
2540             v   += bs;
2541           }
2542         }
2543         if (sum > *norm) *norm = sum;
2544       }
2545     }
2546   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2547   PetscFunctionReturn(0);
2548 }
2549 
2550 
2551 #undef __FUNCT__
2552 #define __FUNCT__ "MatEqual_SeqBAIJ"
2553 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2554 {
2555   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2556   PetscErrorCode ierr;
2557 
2558   PetscFunctionBegin;
2559   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2560   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2561     *flg = PETSC_FALSE;
2562     PetscFunctionReturn(0);
2563   }
2564 
2565   /* if the a->i are the same */
2566   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2567   if (!*flg) PetscFunctionReturn(0);
2568 
2569   /* if a->j are the same */
2570   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2571   if (!*flg) PetscFunctionReturn(0);
2572 
2573   /* if a->a are the same */
2574   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2575   PetscFunctionReturn(0);
2576 
2577 }
2578 
2579 #undef __FUNCT__
2580 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2581 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2582 {
2583   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2584   PetscErrorCode ierr;
2585   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2586   PetscScalar    *x,zero = 0.0;
2587   MatScalar      *aa,*aa_j;
2588 
2589   PetscFunctionBegin;
2590   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2591   bs   = A->rmap->bs;
2592   aa   = a->a;
2593   ai   = a->i;
2594   aj   = a->j;
2595   ambs = a->mbs;
2596   bs2  = a->bs2;
2597 
2598   ierr = VecSet(v,zero);CHKERRQ(ierr);
2599   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2600   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2601   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2602   for (i=0; i<ambs; i++) {
2603     for (j=ai[i]; j<ai[i+1]; j++) {
2604       if (aj[j] == i) {
2605         row  = i*bs;
2606         aa_j = aa+j*bs2;
2607         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2608         break;
2609       }
2610     }
2611   }
2612   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2613   PetscFunctionReturn(0);
2614 }
2615 
2616 #undef __FUNCT__
2617 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2618 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2619 {
2620   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2621   const PetscScalar *l,*r,*li,*ri;
2622   PetscScalar       x;
2623   MatScalar         *aa, *v;
2624   PetscErrorCode    ierr;
2625   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2626   const PetscInt    *ai,*aj;
2627 
2628   PetscFunctionBegin;
2629   ai  = a->i;
2630   aj  = a->j;
2631   aa  = a->a;
2632   m   = A->rmap->n;
2633   n   = A->cmap->n;
2634   bs  = A->rmap->bs;
2635   mbs = a->mbs;
2636   bs2 = a->bs2;
2637   if (ll) {
2638     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2639     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2640     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2641     for (i=0; i<mbs; i++) { /* for each block row */
2642       M  = ai[i+1] - ai[i];
2643       li = l + i*bs;
2644       v  = aa + bs2*ai[i];
2645       for (j=0; j<M; j++) { /* for each block */
2646         for (k=0; k<bs2; k++) {
2647           (*v++) *= li[k%bs];
2648         }
2649       }
2650     }
2651     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2652     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2653   }
2654 
2655   if (rr) {
2656     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2657     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2658     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2659     for (i=0; i<mbs; i++) { /* for each block row */
2660       iai = ai[i];
2661       M   = ai[i+1] - iai;
2662       v   = aa + bs2*iai;
2663       for (j=0; j<M; j++) { /* for each block */
2664         ri = r + bs*aj[iai+j];
2665         for (k=0; k<bs; k++) {
2666           x = ri[k];
2667           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2668           v += bs;
2669         }
2670       }
2671     }
2672     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2673     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2674   }
2675   PetscFunctionReturn(0);
2676 }
2677 
2678 
2679 #undef __FUNCT__
2680 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2681 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2682 {
2683   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2684 
2685   PetscFunctionBegin;
2686   info->block_size   = a->bs2;
2687   info->nz_allocated = a->bs2*a->maxnz;
2688   info->nz_used      = a->bs2*a->nz;
2689   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2690   info->assemblies   = A->num_ass;
2691   info->mallocs      = A->info.mallocs;
2692   info->memory       = ((PetscObject)A)->mem;
2693   if (A->factortype) {
2694     info->fill_ratio_given  = A->info.fill_ratio_given;
2695     info->fill_ratio_needed = A->info.fill_ratio_needed;
2696     info->factor_mallocs    = A->info.factor_mallocs;
2697   } else {
2698     info->fill_ratio_given  = 0;
2699     info->fill_ratio_needed = 0;
2700     info->factor_mallocs    = 0;
2701   }
2702   PetscFunctionReturn(0);
2703 }
2704 
2705 
2706 #if defined(PETSC_THREADCOMM_ACTIVE)
2707 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2708 {
2709   PetscErrorCode ierr;
2710   PetscInt       *trstarts=A->rmap->trstarts;
2711   PetscInt       n,start,end;
2712   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2713 
2714   start = trstarts[thread_id];
2715   end   = trstarts[thread_id+1];
2716   n     = a->i[end] - a->i[start];
2717   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(PetscScalar));CHKERRQ(ierr);
2718   return 0;
2719 }
2720 
2721 #undef __FUNCT__
2722 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2723 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2724 {
2725   PetscErrorCode ierr;
2726 
2727 
2728   PetscFunctionBegin;
2729   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2730   PetscFunctionReturn(0);
2731 }
2732 #else
2733 #undef __FUNCT__
2734 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2735 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2736 {
2737   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2738   PetscErrorCode ierr;
2739 
2740   PetscFunctionBegin;
2741   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2742   PetscFunctionReturn(0);
2743 }
2744 #endif
2745