xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision df6f6c19b095f1abc1ac27fcbb9c2092a4a2a65e)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag,sorted;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
93   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
98   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
99 
100   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
101   ssmap = smap;
102   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109     for (k=kstart; k<kend; k++) {
110       if (ssmap[aj[k]]) lens[i]++;
111     }
112   }
113   /* Create and fill new matrix */
114   if (scall == MAT_REUSE_MATRIX) {
115     c = (Mat_SeqBAIJ*)((*B)->data);
116 
117     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
119     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
121     C    = *B;
122   } else {
123     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
124     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
125     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
126     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
127   }
128   c = (Mat_SeqBAIJ*)(C->data);
129   for (i=0; i<nrows; i++) {
130     row      = irow[i];
131     kstart   = ai[row];
132     kend     = kstart + a->ilen[row];
133     mat_i    = c->i[i];
134     mat_j    = c->j + mat_i;
135     mat_a    = c->a + mat_i*bs2;
136     mat_ilen = c->ilen + i;
137     for (k=kstart; k<kend; k++) {
138       if ((tcol=ssmap[a->j[k]])) {
139         *mat_j++ = tcol - 1;
140         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
141         mat_a   += bs2;
142         (*mat_ilen)++;
143       }
144     }
145   }
146 
147   /* Free work space */
148   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
149   ierr = PetscFree(smap);CHKERRQ(ierr);
150   ierr = PetscFree(lens);CHKERRQ(ierr);
151   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
152   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
153 
154   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
155   *B   = C;
156   PetscFunctionReturn(0);
157 }
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
161 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
162 {
163   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
164   IS             is1,is2;
165   PetscErrorCode ierr;
166   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
167   const PetscInt *irow,*icol;
168 
169   PetscFunctionBegin;
170   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
171   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
172   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
173   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
174 
175   /* Verify if the indices corespond to each element in a block
176    and form the IS with compressed IS */
177   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
178   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
179   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180   count = 0;
181   for (i=0; i<a->mbs; i++) {
182     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183     if (vary[i]==bs) iary[count++] = i;
184   }
185   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
186 
187   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
188   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189   count = 0;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
192     if (vary[i]==bs) iary[count++] = i;
193   }
194   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
195   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
196   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
197   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
198 
199   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
200   ierr = ISDestroy(&is1);CHKERRQ(ierr);
201   ierr = ISDestroy(&is2);CHKERRQ(ierr);
202   PetscFunctionReturn(0);
203 }
204 
205 #undef __FUNCT__
206 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
207 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208 {
209   PetscErrorCode ierr;
210   PetscInt       i;
211 
212   PetscFunctionBegin;
213   if (scall == MAT_INITIAL_MATRIX) {
214     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
215   }
216 
217   for (i=0; i<n; i++) {
218     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
219   }
220   PetscFunctionReturn(0);
221 }
222 
223 
224 /* -------------------------------------------------------*/
225 /* Should check that shapes of vectors and matrices match */
226 /* -------------------------------------------------------*/
227 
228 #if defined(PETSC_THREADCOMM_ACTIVE)
229 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
230 {
231   PetscErrorCode    ierr;
232   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
233   PetscScalar       *z;
234   const PetscScalar *x;
235   const MatScalar   *aa;
236   PetscInt          *trstarts=A->rmap->trstarts;
237   PetscInt          n,start,end,i;
238   const PetscInt    *aj,*ai;
239   PetscScalar       sum;
240 
241   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
242   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
243   start = trstarts[thread_id];
244   end   = trstarts[thread_id+1];
245   ai    = a->i;
246   for (i=start; i<end; i++) {
247     n   = ai[i+1] - ai[i];
248     aj  = a->j + ai[i];
249     aa  = a->a + ai[i];
250     sum = 0.0;
251     PetscSparseDensePlusDot(sum,x,aa,aj,n);
252     z[i] = sum;
253   }
254   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
255   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
256   return 0;
257 }
258 
259 #undef __FUNCT__
260 #define __FUNCT__ "MatMult_SeqBAIJ_1"
261 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
262 {
263   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
264   PetscScalar       *z,sum;
265   const PetscScalar *x;
266   const MatScalar   *v;
267   PetscErrorCode    ierr;
268   PetscInt          mbs,i,n;
269   const PetscInt    *idx,*ii,*ridx=NULL;
270   PetscBool         usecprow=a->compressedrow.use;
271 
272   PetscFunctionBegin;
273 
274   if (usecprow) {
275     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
276     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
277     mbs  = a->compressedrow.nrows;
278     ii   = a->compressedrow.i;
279     ridx = a->compressedrow.rindex;
280     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
281     for (i=0; i<mbs; i++) {
282       n   = ii[i+1] - ii[i];
283       v   = a->a + ii[i];
284       idx = a->j + ii[i];
285       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
286       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
287       sum = 0.0;
288       PetscSparseDensePlusDot(sum,x,v,idx,n);
289       z[ridx[i]] = sum;
290     }
291     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
292     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
293   } else {
294     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
295   }
296   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 #else
300 #undef __FUNCT__
301 #define __FUNCT__ "MatMult_SeqBAIJ_1"
302 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
303 {
304   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
305   PetscScalar       *z,sum;
306   const PetscScalar *x;
307   const MatScalar   *v;
308   PetscErrorCode    ierr;
309   PetscInt          mbs,i,n;
310   const PetscInt    *idx,*ii,*ridx=NULL;
311   PetscBool         usecprow=a->compressedrow.use;
312 
313   PetscFunctionBegin;
314   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
315   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
316 
317   if (usecprow) {
318     mbs  = a->compressedrow.nrows;
319     ii   = a->compressedrow.i;
320     ridx = a->compressedrow.rindex;
321     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
322   } else {
323     mbs = a->mbs;
324     ii  = a->i;
325   }
326 
327   for (i=0; i<mbs; i++) {
328     n   = ii[1] - ii[0];
329     v   = a->a + ii[0];
330     idx = a->j + ii[0];
331     ii++;
332     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
333     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
334     sum = 0.0;
335     PetscSparseDensePlusDot(sum,x,v,idx,n);
336     if (usecprow) {
337       z[ridx[i]] = sum;
338     } else {
339       z[i]        = sum;
340     }
341   }
342   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 #endif
348 
349 #if defined(PETSC_THREADCOMM_ACTIVE)
350 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
351 {
352   PetscErrorCode    ierr;
353   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
354   PetscScalar       *z,x1,x2,sum1,sum2;
355   const PetscScalar *x,*xb;
356   const MatScalar   *aa;
357   PetscInt          *trstarts=A->rmap->trstarts;
358   PetscInt          n,start,end,i,j;
359   const PetscInt    *aj,*ai;
360 
361   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
362   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
363   start  = trstarts[thread_id] / 2;
364   end    = trstarts[thread_id+1] / 2;
365   ai     = a->i;
366   for (i=start; i<end; i++) {
367     n    = ai[i+1] - ai[i];
368     aj   = a->j + ai[i];
369     aa   = a->a + ai[i]*4;
370     sum1 = 0.0; sum2 = 0.0;
371     for (j=0; j<n; j++) {
372       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
373       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
374       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
375     }
376     z[2*i] = sum1; z[2*i+1] = sum2;
377   }
378   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
379   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
380   return 0;
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatMult_SeqBAIJ_2"
385 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
386 {
387   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
388   PetscScalar       *z,x1,x2,sum1,sum2;
389   const PetscScalar *x,*xb;
390   const MatScalar   *v;
391   PetscErrorCode    ierr;
392   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
393   PetscBool         usecprow=a->compressedrow.use;
394 
395   PetscFunctionBegin;
396 
397   if (usecprow) {
398     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
399     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
400     mbs  = a->compressedrow.nrows;
401     ii   = a->compressedrow.i;
402     ridx = a->compressedrow.rindex;
403     for (i=0; i<mbs; i++) {
404       n    = ii[i+1] - ii[i];
405       idx  = a->j + ii[i];
406       v    = a->a + ii[i]*4;
407       sum1 = 0.0; sum2 = 0.0;
408       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410       for (j=0; j<n; j++) {
411 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
412 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
413 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
414       }
415       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
416     }
417     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
418     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
419   } else {
420     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
421   }
422   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
423   PetscFunctionReturn(0);
424 }
425 #else
426 #undef __FUNCT__
427 #define __FUNCT__ "MatMult_SeqBAIJ_2"
428 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
429 {
430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
431   PetscScalar       *z = 0,sum1,sum2,*zarray;
432   const PetscScalar *x,*xb;
433   PetscScalar       x1,x2;
434   const MatScalar   *v;
435   PetscErrorCode    ierr;
436   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
437   PetscBool         usecprow=a->compressedrow.use;
438 
439   PetscFunctionBegin;
440   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
442 
443   idx = a->j;
444   v   = a->a;
445   if (usecprow) {
446     mbs  = a->compressedrow.nrows;
447     ii   = a->compressedrow.i;
448     ridx = a->compressedrow.rindex;
449   } else {
450     mbs = a->mbs;
451     ii  = a->i;
452     z   = zarray;
453   }
454 
455   for (i=0; i<mbs; i++) {
456     n           = ii[1] - ii[0]; ii++;
457     sum1        = 0.0; sum2 = 0.0;
458     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
459     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
460     for (j=0; j<n; j++) {
461       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
462       sum1 += v[0]*x1 + v[2]*x2;
463       sum2 += v[1]*x1 + v[3]*x2;
464       v    += 4;
465     }
466     if (usecprow) z = zarray + 2*ridx[i];
467     z[0] = sum1; z[1] = sum2;
468     if (!usecprow) z += 2;
469   }
470   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
471   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
472   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
473   PetscFunctionReturn(0);
474 }
475 #endif
476 
477 #if defined(PETSC_THREADCOMM_ACTIVE)
478 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
479 {
480   PetscErrorCode    ierr;
481   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
482   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
483   const PetscScalar *x,*xb;
484   const MatScalar   *aa;
485   PetscInt          *trstarts=A->rmap->trstarts;
486   PetscInt          n,start,end,i,j;
487   const PetscInt    *aj,*ai;
488 
489   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
490   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
491   start  = trstarts[thread_id] / 3;
492   end    = trstarts[thread_id+1] / 3;
493   ai     = a->i;
494   for (i=start; i<end; i++) {
495     n    = ai[i+1] - ai[i];
496     aj   = a->j + ai[i];
497     aa   = a->a + ai[i]*9;
498     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
499     for (j=0; j<n; j++) {
500       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
501       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
502       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
503       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
504     }
505     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
506   }
507   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
508   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509   return 0;
510 }
511 
512 #undef __FUNCT__
513 #define __FUNCT__ "MatMult_SeqBAIJ_3"
514 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
517   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
522   PetscBool         usecprow=a->compressedrow.use;
523 
524 
525 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
526 #pragma disjoint(*v,*z,*xb)
527 #endif
528 
529   PetscFunctionBegin;
530 
531   if (usecprow) {
532     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
533     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
534     mbs  = a->compressedrow.nrows;
535     ii   = a->compressedrow.i;
536     ridx = a->compressedrow.rindex;
537     for (i=0; i<mbs; i++) {
538       n    = ii[i+1] - ii[i];
539       idx  = a->j + ii[i];
540       v    = a->a + ii[i]*9;
541       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
542       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
543       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
544       for (j=0; j<n; j++) {
545 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
546 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
547 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
548 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
549       }
550       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
551     }
552     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
553     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
554   } else {
555     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
556   }
557   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
558   PetscFunctionReturn(0);
559 }
560 #else
561 #undef __FUNCT__
562 #define __FUNCT__ "MatMult_SeqBAIJ_3"
563 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
564 {
565   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
566   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
567   const PetscScalar *x,*xb;
568   const MatScalar   *v;
569   PetscErrorCode    ierr;
570   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
571   PetscBool         usecprow=a->compressedrow.use;
572 
573 
574 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
575 #pragma disjoint(*v,*z,*xb)
576 #endif
577 
578   PetscFunctionBegin;
579   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
580   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
581 
582   idx = a->j;
583   v   = a->a;
584   if (usecprow) {
585     mbs  = a->compressedrow.nrows;
586     ii   = a->compressedrow.i;
587     ridx = a->compressedrow.rindex;
588   } else {
589     mbs = a->mbs;
590     ii  = a->i;
591     z   = zarray;
592   }
593 
594   for (i=0; i<mbs; i++) {
595     n           = ii[1] - ii[0]; ii++;
596     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
597     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
598     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599     for (j=0; j<n; j++) {
600       xb = x + 3*(*idx++);
601       x1 = xb[0];
602       x2 = xb[1];
603       x3 = xb[2];
604 
605       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
606       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
607       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
608       v    += 9;
609     }
610     if (usecprow) z = zarray + 3*ridx[i];
611     z[0] = sum1; z[1] = sum2; z[2] = sum3;
612     if (!usecprow) z += 3;
613   }
614   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
616   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
617   PetscFunctionReturn(0);
618 }
619 #endif
620 
621 #if defined(PETSC_THREADCOMM_ACTIVE)
622 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
623 {
624   PetscErrorCode    ierr;
625   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
626   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
627   const PetscScalar *x,*xb;
628   const MatScalar   *aa;
629   PetscInt          *trstarts=A->rmap->trstarts;
630   PetscInt          n,start,end,i,j;
631   const PetscInt    *aj,*ai;
632 
633   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
634   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
635   start  = trstarts[thread_id] / 4;
636   end    = trstarts[thread_id+1] / 4;
637   ai     = a->i;
638   for (i=start; i<end; i++) {
639     n    = ai[i+1] - ai[i];
640     aj   = a->j + ai[i];
641     aa   = a->a + ai[i]*16;
642     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
643     for (j=0; j<n; j++) {
644       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
645       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
646       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
647       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
648       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
649     }
650     z[4*i]   = sum1; z[4*i+1] = sum2;
651     z[4*i+2] = sum3; z[4*i+3] = sum4;
652   }
653   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
654   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
655   return 0;
656 }
657 
658 #undef __FUNCT__
659 #define __FUNCT__ "MatMult_SeqBAIJ_4"
660 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
661 {
662   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
663   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
664   const PetscScalar *x,*xb;
665   const MatScalar   *v;
666   PetscErrorCode    ierr;
667   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
668   PetscBool         usecprow=a->compressedrow.use;
669 
670   PetscFunctionBegin;
671 
672   if (usecprow) {
673     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
674     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
675     mbs  = a->compressedrow.nrows;
676     ii   = a->compressedrow.i;
677     ridx = a->compressedrow.rindex;
678     for (i=0; i<mbs; i++) {
679       n = ii[i+1] - ii[1];
680       idx  = a->j + ii[i];
681       v    = a->a + ii[i]*16;
682       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
683       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
684       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
685       for (j=0; j<n; j++) {
686 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
687 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
688 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
689 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
690 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
691       }
692       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
693       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
694     }
695     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
696     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
697   } else {
698     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
699   }
700   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
701   PetscFunctionReturn(0);
702 }
703 #else
704 #undef __FUNCT__
705 #define __FUNCT__ "MatMult_SeqBAIJ_4"
706 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
707 {
708   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
709   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
710   const PetscScalar *x,*xb;
711   const MatScalar   *v;
712   PetscErrorCode    ierr;
713   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
714   PetscBool         usecprow=a->compressedrow.use;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
718   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
719 
720   idx = a->j;
721   v   = a->a;
722   if (usecprow) {
723     mbs  = a->compressedrow.nrows;
724     ii   = a->compressedrow.i;
725     ridx = a->compressedrow.rindex;
726   } else {
727     mbs = a->mbs;
728     ii  = a->i;
729     z   = zarray;
730   }
731 
732   for (i=0; i<mbs; i++) {
733     n = ii[1] - ii[0];
734     ii++;
735     sum1 = 0.0;
736     sum2 = 0.0;
737     sum3 = 0.0;
738     sum4 = 0.0;
739 
740     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
741     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
742     for (j=0; j<n; j++) {
743       xb    = x + 4*(*idx++);
744       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
745       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
746       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
747       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
748       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
749       v    += 16;
750     }
751     if (usecprow) z = zarray + 4*ridx[i];
752     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
753     if (!usecprow) z += 4;
754   }
755   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
756   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
757   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
758   PetscFunctionReturn(0);
759 }
760 #endif
761 
762 #if defined(PETSC_THREADCOMM_ACTIVE)
763 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
764 {
765   PetscErrorCode    ierr;
766   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
767   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
768   const PetscScalar *x,*xb;
769   const MatScalar   *aa;
770   PetscInt          *trstarts=A->rmap->trstarts;
771   PetscInt          n,start,end,i,j;
772   const PetscInt    *aj,*ai;
773 
774   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
775   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
776   start  = trstarts[thread_id] / 5;
777   end    = trstarts[thread_id+1] / 5;
778   ai     = a->i;
779   for (i=start; i<end; i++) {
780     n    = ai[i+1] - ai[i];
781     aj   = a->j + ai[i];
782     aa   = a->a + ai[i]*25;
783     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
784     for (j=0; j<n; j++) {
785       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
786       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
787       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
788       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
789       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
790       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
791     }
792     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
793     z[5*i+3] = sum4; z[5*i+4] = sum5;
794   }
795   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
796   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
797   return 0;
798 }
799 
800 #undef __FUNCT__
801 #define __FUNCT__ "MatMult_SeqBAIJ_5"
802 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
803 {
804   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
805   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
806   const PetscScalar *xb,*x;
807   const MatScalar   *v;
808   PetscErrorCode    ierr;
809   const PetscInt    *idx,*ii,*ridx=NULL;
810   PetscInt          mbs,i,j,n;
811   PetscBool         usecprow=a->compressedrow.use;
812 
813   PetscFunctionBegin;
814 
815   if (usecprow) {
816     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
817     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
818     mbs  = a->compressedrow.nrows;
819     ii   = a->compressedrow.i;
820     ridx = a->compressedrow.rindex;
821     for (i=0; i<mbs; i++) {
822       n    = ii[i+1] - ii[i];
823       idx  = a->j + ii[i];
824       v    = a->a + ii[i]*25;
825       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
826       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
827       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
828       for (j=0; j<n; j++) {
829 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
830 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
831 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
832 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
833 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
834 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
835       }
836       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
837       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
838     }
839     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
840     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
841   } else {
842     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
843   }
844   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
845   PetscFunctionReturn(0);
846 }
847 #else
848 #undef __FUNCT__
849 #define __FUNCT__ "MatMult_SeqBAIJ_5"
850 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
851 {
852   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
853   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
854   const PetscScalar *xb,*x;
855   const MatScalar   *v;
856   PetscErrorCode    ierr;
857   const PetscInt    *idx,*ii,*ridx=NULL;
858   PetscInt          mbs,i,j,n;
859   PetscBool         usecprow=a->compressedrow.use;
860 
861   PetscFunctionBegin;
862   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
863   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
864 
865   idx = a->j;
866   v   = a->a;
867   if (usecprow) {
868     mbs  = a->compressedrow.nrows;
869     ii   = a->compressedrow.i;
870     ridx = a->compressedrow.rindex;
871   } else {
872     mbs = a->mbs;
873     ii  = a->i;
874     z   = zarray;
875   }
876 
877   for (i=0; i<mbs; i++) {
878     n           = ii[1] - ii[0]; ii++;
879     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
880     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
881     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
882     for (j=0; j<n; j++) {
883       xb    = x + 5*(*idx++);
884       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
885       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
886       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
887       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
888       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
889       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
890       v    += 25;
891     }
892     if (usecprow) z = zarray + 5*ridx[i];
893     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
894     if (!usecprow) z += 5;
895   }
896   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
897   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
898   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
899   PetscFunctionReturn(0);
900 }
901 #endif
902 
903 
904 #if defined(PETSC_THREADCOMM_ACTIVE)
905 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
906 {
907   PetscErrorCode    ierr;
908   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
909   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
910   const PetscScalar *x,*xb;
911   const MatScalar   *aa;
912   PetscInt          *trstarts=A->rmap->trstarts;
913   PetscInt          n,start,end,i,j;
914   const PetscInt    *aj,*ai;
915 
916   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
917   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
918   start  = trstarts[thread_id] / 6;
919   end    = trstarts[thread_id+1] / 6;
920   ai     = a->i;
921   for (i=start; i<end; i++) {
922     n    = ai[i+1] - ai[i];
923     aj   = a->j + ai[i];
924     aa   = a->a + ai[i]*36;
925     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
926     for (j=0; j<n; j++) {
927       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
928       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
929       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
930       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
931       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
932       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
933       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
934     }
935     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
936     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
937   }
938   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
939   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
940   return 0;
941 }
942 
943 #undef __FUNCT__
944 #define __FUNCT__ "MatMult_SeqBAIJ_6"
945 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
946 {
947   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
948   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
949   const PetscScalar *x,*xb;
950   const MatScalar   *v;
951   PetscErrorCode    ierr;
952   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
953   PetscBool         usecprow=a->compressedrow.use;
954 
955   PetscFunctionBegin;
956 
957   if (usecprow) {
958     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
959     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
960     mbs  = a->compressedrow.nrows;
961     ii   = a->compressedrow.i;
962     ridx = a->compressedrow.rindex;
963     for (i=0; i<mbs; i++) {
964       n  = ii[i+1] - ii[i];
965       idx  = a->j + ii[i];
966       v    = a->a + ii[i]*36;
967       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
968       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
969       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
970       for (j=0; j<n; j++) {
971 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
972 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
973 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
974 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
975 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
976 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
977 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
978       }
979       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
980       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
981     }
982     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
983     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
984   } else {
985     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
986   }
987   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
988   PetscFunctionReturn(0);
989 }
990 #else
991 #undef __FUNCT__
992 #define __FUNCT__ "MatMult_SeqBAIJ_6"
993 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
994 {
995   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
996   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
997   const PetscScalar *x,*xb;
998   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
999   const MatScalar   *v;
1000   PetscErrorCode    ierr;
1001   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1002   PetscBool         usecprow=a->compressedrow.use;
1003 
1004   PetscFunctionBegin;
1005   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1006   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1007 
1008   idx = a->j;
1009   v   = a->a;
1010   if (usecprow) {
1011     mbs  = a->compressedrow.nrows;
1012     ii   = a->compressedrow.i;
1013     ridx = a->compressedrow.rindex;
1014   } else {
1015     mbs = a->mbs;
1016     ii  = a->i;
1017     z   = zarray;
1018   }
1019 
1020   for (i=0; i<mbs; i++) {
1021     n  = ii[1] - ii[0];
1022     ii++;
1023     sum1 = 0.0;
1024     sum2 = 0.0;
1025     sum3 = 0.0;
1026     sum4 = 0.0;
1027     sum5 = 0.0;
1028     sum6 = 0.0;
1029 
1030     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1031     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1032     for (j=0; j<n; j++) {
1033       xb    = x + 6*(*idx++);
1034       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1035       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1036       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1037       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1038       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1039       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1040       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1041       v    += 36;
1042     }
1043     if (usecprow) z = zarray + 6*ridx[i];
1044     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1045     if (!usecprow) z += 6;
1046   }
1047 
1048   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1049   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1050   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1051   PetscFunctionReturn(0);
1052 }
1053 #endif
1054 
1055 #if defined(PETSC_THREADCOMM_ACTIVE)
1056 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1057 {
1058   PetscErrorCode    ierr;
1059   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1060   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1061   const PetscScalar *x,*xb;
1062   const MatScalar   *aa;
1063   PetscInt          *trstarts=A->rmap->trstarts;
1064   PetscInt          n,start,end,i,j;
1065   const PetscInt    *aj,*ai;
1066 
1067   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1068   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1069   start  = trstarts[thread_id] / 7;
1070   end    = trstarts[thread_id+1] / 7;
1071   ai     = a->i;
1072   for (i=start; i<end; i++) {
1073     n    = ai[i+1] - ai[i];
1074     aj   = a->j + ai[i];
1075     aa   = a->a + ai[i]*49;
1076     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1077     for (j=0; j<n; j++) {
1078       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1079       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1080       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1081       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1082       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1083       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1084       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1085       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1086     }
1087     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1088     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1089   }
1090   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1092   return 0;
1093 }
1094 
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1097 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1098 {
1099   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1100   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1101   const PetscScalar *x,*xb;
1102   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1103   const MatScalar   *v;
1104   PetscErrorCode    ierr;
1105   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1106   PetscBool         usecprow=a->compressedrow.use;
1107 
1108   PetscFunctionBegin;
1109 
1110   if (usecprow) {
1111     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1112     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1113     mbs  = a->compressedrow.nrows;
1114     ii   = a->compressedrow.i;
1115     ridx = a->compressedrow.rindex;
1116     for (i=0; i<mbs; i++) {
1117       n    = ii[i+1] - ii[i];
1118       idx  = a->j + ii[i];
1119       v    = a->a + ii[i]*49;
1120       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1121       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1122       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123       for (j=0; j<n; j++) {
1124 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1125 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1126 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1127 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1128 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1129 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1130 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1131 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1132       }
1133       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1134       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1135     }
1136     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1137     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1138   } else {
1139     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1140   }
1141   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1142   PetscFunctionReturn(0);
1143 }
1144 #else
1145 #undef __FUNCT__
1146 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1147 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1148 {
1149   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1150   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1151   const PetscScalar *x,*xb;
1152   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1153   const MatScalar   *v;
1154   PetscErrorCode    ierr;
1155   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1156   PetscBool         usecprow=a->compressedrow.use;
1157 
1158   PetscFunctionBegin;
1159   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1160   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1161 
1162   idx = a->j;
1163   v   = a->a;
1164   if (usecprow) {
1165     mbs  = a->compressedrow.nrows;
1166     ii   = a->compressedrow.i;
1167     ridx = a->compressedrow.rindex;
1168   } else {
1169     mbs = a->mbs;
1170     ii  = a->i;
1171     z   = zarray;
1172   }
1173 
1174   for (i=0; i<mbs; i++) {
1175     n  = ii[1] - ii[0];
1176     ii++;
1177     sum1 = 0.0;
1178     sum2 = 0.0;
1179     sum3 = 0.0;
1180     sum4 = 0.0;
1181     sum5 = 0.0;
1182     sum6 = 0.0;
1183     sum7 = 0.0;
1184 
1185     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1186     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1187     for (j=0; j<n; j++) {
1188       xb    = x + 7*(*idx++);
1189       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1190       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1191       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1192       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1193       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1194       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1195       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1196       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1197       v    += 49;
1198     }
1199     if (usecprow) z = zarray + 7*ridx[i];
1200     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1201     if (!usecprow) z += 7;
1202   }
1203 
1204   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1205   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1206   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1207   PetscFunctionReturn(0);
1208 }
1209 #endif
1210 
1211 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1212 /* Default MatMult for block size 15 */
1213 
1214 #undef __FUNCT__
1215 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1216 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1217 {
1218   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1219   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1220   const PetscScalar *x,*xb;
1221   PetscScalar       *zarray,xv;
1222   const MatScalar   *v;
1223   PetscErrorCode    ierr;
1224   const PetscInt    *ii,*ij=a->j,*idx;
1225   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1226   PetscBool         usecprow=a->compressedrow.use;
1227 
1228   PetscFunctionBegin;
1229   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1230   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1231 
1232   v = a->a;
1233   if (usecprow) {
1234     mbs  = a->compressedrow.nrows;
1235     ii   = a->compressedrow.i;
1236     ridx = a->compressedrow.rindex;
1237   } else {
1238     mbs = a->mbs;
1239     ii  = a->i;
1240     z   = zarray;
1241   }
1242 
1243   for (i=0; i<mbs; i++) {
1244     n    = ii[i+1] - ii[i];
1245     idx  = ij + ii[i];
1246     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1247     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1248 
1249     for (j=0; j<n; j++) {
1250       xb = x + 15*(idx[j]);
1251 
1252       for (k=0; k<15; k++) {
1253         xv     =  xb[k];
1254         sum1  += v[0]*xv;
1255         sum2  += v[1]*xv;
1256         sum3  += v[2]*xv;
1257         sum4  += v[3]*xv;
1258         sum5  += v[4]*xv;
1259         sum6  += v[5]*xv;
1260         sum7  += v[6]*xv;
1261         sum8  += v[7]*xv;
1262         sum9  += v[8]*xv;
1263         sum10 += v[9]*xv;
1264         sum11 += v[10]*xv;
1265         sum12 += v[11]*xv;
1266         sum13 += v[12]*xv;
1267         sum14 += v[13]*xv;
1268         sum15 += v[14]*xv;
1269         v     += 15;
1270       }
1271     }
1272     if (usecprow) z = zarray + 15*ridx[i];
1273     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1274     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1275 
1276     if (!usecprow) z += 15;
1277   }
1278 
1279   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1280   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1281   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1282   PetscFunctionReturn(0);
1283 }
1284 
1285 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1286 #undef __FUNCT__
1287 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1288 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1289 {
1290   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1291   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1292   const PetscScalar *x,*xb;
1293   PetscScalar       x1,x2,x3,x4,*zarray;
1294   const MatScalar   *v;
1295   PetscErrorCode    ierr;
1296   const PetscInt    *ii,*ij=a->j,*idx;
1297   PetscInt          mbs,i,j,n,*ridx=NULL;
1298   PetscBool         usecprow=a->compressedrow.use;
1299 
1300   PetscFunctionBegin;
1301   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1302   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1303 
1304   v = a->a;
1305   if (usecprow) {
1306     mbs  = a->compressedrow.nrows;
1307     ii   = a->compressedrow.i;
1308     ridx = a->compressedrow.rindex;
1309   } else {
1310     mbs = a->mbs;
1311     ii  = a->i;
1312     z   = zarray;
1313   }
1314 
1315   for (i=0; i<mbs; i++) {
1316     n    = ii[i+1] - ii[i];
1317     idx  = ij + ii[i];
1318     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1319     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1320 
1321     for (j=0; j<n; j++) {
1322       xb = x + 15*(idx[j]);
1323       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1324 
1325       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1326       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1327       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1328       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1329       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1330       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1331       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1332       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1333       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1334       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1335       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1336       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1337       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1338       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1339       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1340 
1341       v += 60;
1342 
1343       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1344 
1345       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1346       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1347       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1348       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1349       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1350       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1351       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1352       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1353       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1354       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1355       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1356       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1357       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1358       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1359       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1360       v     += 60;
1361 
1362       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1363       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1364       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1365       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1366       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1367       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1368       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1369       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1370       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1371       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1372       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1373       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1374       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1375       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1376       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1377       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1378       v     += 60;
1379 
1380       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1381       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1382       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1383       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1384       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1385       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1386       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1387       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1388       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1389       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1390       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1391       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1392       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1393       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1394       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1395       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1396       v     += 45;
1397     }
1398     if (usecprow) z = zarray + 15*ridx[i];
1399     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1400     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1401 
1402     if (!usecprow) z += 15;
1403   }
1404 
1405   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1406   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1407   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1412 #undef __FUNCT__
1413 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1414 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1415 {
1416   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1417   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1418   const PetscScalar *x,*xb;
1419   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1420   const MatScalar   *v;
1421   PetscErrorCode    ierr;
1422   const PetscInt    *ii,*ij=a->j,*idx;
1423   PetscInt          mbs,i,j,n,*ridx=NULL;
1424   PetscBool         usecprow=a->compressedrow.use;
1425 
1426   PetscFunctionBegin;
1427   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1428   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1429 
1430   v = a->a;
1431   if (usecprow) {
1432     mbs  = a->compressedrow.nrows;
1433     ii   = a->compressedrow.i;
1434     ridx = a->compressedrow.rindex;
1435   } else {
1436     mbs = a->mbs;
1437     ii  = a->i;
1438     z   = zarray;
1439   }
1440 
1441   for (i=0; i<mbs; i++) {
1442     n    = ii[i+1] - ii[i];
1443     idx  = ij + ii[i];
1444     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1445     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1446 
1447     for (j=0; j<n; j++) {
1448       xb = x + 15*(idx[j]);
1449       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1450       x8 = xb[7];
1451 
1452       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1453       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1454       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1455       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1456       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1457       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1458       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1459       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1460       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1461       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1462       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1463       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1464       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1465       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1466       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1467       v     += 120;
1468 
1469       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1470 
1471       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1472       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1473       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1474       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1475       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1476       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1477       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1478       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1479       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1480       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1481       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1482       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1483       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1484       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1485       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1486       v     += 105;
1487     }
1488     if (usecprow) z = zarray + 15*ridx[i];
1489     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1490     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1491 
1492     if (!usecprow) z += 15;
1493   }
1494 
1495   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1496   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1497   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1502 
1503 #undef __FUNCT__
1504 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1505 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1506 {
1507   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1508   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1509   const PetscScalar *x,*xb;
1510   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1511   const MatScalar   *v;
1512   PetscErrorCode    ierr;
1513   const PetscInt    *ii,*ij=a->j,*idx;
1514   PetscInt          mbs,i,j,n,*ridx=NULL;
1515   PetscBool         usecprow=a->compressedrow.use;
1516 
1517   PetscFunctionBegin;
1518   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1519   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1520 
1521   v = a->a;
1522   if (usecprow) {
1523     mbs  = a->compressedrow.nrows;
1524     ii   = a->compressedrow.i;
1525     ridx = a->compressedrow.rindex;
1526   } else {
1527     mbs = a->mbs;
1528     ii  = a->i;
1529     z   = zarray;
1530   }
1531 
1532   for (i=0; i<mbs; i++) {
1533     n    = ii[i+1] - ii[i];
1534     idx  = ij + ii[i];
1535     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1536     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1537 
1538     for (j=0; j<n; j++) {
1539       xb = x + 15*(idx[j]);
1540       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1541       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1542 
1543       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1544       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1545       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1546       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1547       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1548       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1549       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1550       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1551       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1552       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1553       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1554       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1555       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1556       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1557       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1558       v     += 225;
1559     }
1560     if (usecprow) z = zarray + 15*ridx[i];
1561     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1562     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1563 
1564     if (!usecprow) z += 15;
1565   }
1566 
1567   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1568   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1569   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 
1574 /*
1575     This will not work with MatScalar == float because it calls the BLAS
1576 */
1577 #undef __FUNCT__
1578 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1579 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1580 {
1581   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1582   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1583   MatScalar      *v;
1584   PetscErrorCode ierr;
1585   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1586   PetscInt       ncols,k,*ridx=NULL;
1587   PetscBool      usecprow=a->compressedrow.use;
1588 
1589   PetscFunctionBegin;
1590   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1591   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1592 
1593   idx = a->j;
1594   v   = a->a;
1595   if (usecprow) {
1596     mbs  = a->compressedrow.nrows;
1597     ii   = a->compressedrow.i;
1598     ridx = a->compressedrow.rindex;
1599   } else {
1600     mbs = a->mbs;
1601     ii  = a->i;
1602     z   = zarray;
1603   }
1604 
1605   if (!a->mult_work) {
1606     k    = PetscMax(A->rmap->n,A->cmap->n);
1607     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1608   }
1609   work = a->mult_work;
1610   for (i=0; i<mbs; i++) {
1611     n           = ii[1] - ii[0]; ii++;
1612     ncols       = n*bs;
1613     workt       = work;
1614     for (j=0; j<n; j++) {
1615       xb = x + bs*(*idx++);
1616       for (k=0; k<bs; k++) workt[k] = xb[k];
1617       workt += bs;
1618     }
1619     if (usecprow) z = zarray + bs*ridx[i];
1620     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1621     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1622     v += n*bs2;
1623     if (!usecprow) z += bs;
1624   }
1625   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1626   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1627   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 #undef __FUNCT__
1632 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1633 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1634 {
1635   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1636   const PetscScalar *x;
1637   PetscScalar       *y,*z,sum;
1638   const MatScalar   *v;
1639   PetscErrorCode    ierr;
1640   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1641   const PetscInt    *idx,*ii;
1642   PetscBool         usecprow=a->compressedrow.use;
1643 
1644   PetscFunctionBegin;
1645   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1646   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1647   if (zz != yy) {
1648     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1649   } else {
1650     z = y;
1651   }
1652 
1653   idx = a->j;
1654   v   = a->a;
1655   if (usecprow) {
1656     if (zz != yy) {
1657       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1658     }
1659     mbs  = a->compressedrow.nrows;
1660     ii   = a->compressedrow.i;
1661     ridx = a->compressedrow.rindex;
1662   } else {
1663     ii = a->i;
1664   }
1665 
1666   for (i=0; i<mbs; i++) {
1667     n = ii[1] - ii[0];
1668     ii++;
1669     if (!usecprow) {
1670       sum         = y[i];
1671     } else {
1672       sum = y[ridx[i]];
1673     }
1674     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1675     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1676     PetscSparseDensePlusDot(sum,x,v,idx,n);
1677     v   += n;
1678     idx += n;
1679     if (usecprow) {
1680       z[ridx[i]] = sum;
1681     } else {
1682       z[i] = sum;
1683     }
1684   }
1685   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1686   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1687   if (zz != yy) {
1688     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1689   }
1690   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 #undef __FUNCT__
1695 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1696 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1697 {
1698   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1699   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1700   PetscScalar    x1,x2,*yarray,*zarray;
1701   MatScalar      *v;
1702   PetscErrorCode ierr;
1703   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1704   PetscBool      usecprow=a->compressedrow.use;
1705 
1706   PetscFunctionBegin;
1707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1709   if (zz != yy) {
1710     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1711   } else {
1712     zarray = yarray;
1713   }
1714 
1715   idx = a->j;
1716   v   = a->a;
1717   if (usecprow) {
1718     if (zz != yy) {
1719       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1720     }
1721     mbs  = a->compressedrow.nrows;
1722     ii   = a->compressedrow.i;
1723     ridx = a->compressedrow.rindex;
1724     if (zz != yy) {
1725       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1726     }
1727   } else {
1728     ii = a->i;
1729     y  = yarray;
1730     z  = zarray;
1731   }
1732 
1733   for (i=0; i<mbs; i++) {
1734     n = ii[1] - ii[0]; ii++;
1735     if (usecprow) {
1736       z = zarray + 2*ridx[i];
1737       y = yarray + 2*ridx[i];
1738     }
1739     sum1 = y[0]; sum2 = y[1];
1740     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1741     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1742     for (j=0; j<n; j++) {
1743       xb = x + 2*(*idx++);
1744       x1 = xb[0];
1745       x2 = xb[1];
1746 
1747       sum1 += v[0]*x1 + v[2]*x2;
1748       sum2 += v[1]*x1 + v[3]*x2;
1749       v    += 4;
1750     }
1751     z[0] = sum1; z[1] = sum2;
1752     if (!usecprow) {
1753       z += 2; y += 2;
1754     }
1755   }
1756   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1757   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1758   if (zz != yy) {
1759     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1760   }
1761   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 #undef __FUNCT__
1766 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1767 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1768 {
1769   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1770   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1771   MatScalar      *v;
1772   PetscErrorCode ierr;
1773   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1774   PetscBool      usecprow=a->compressedrow.use;
1775 
1776   PetscFunctionBegin;
1777   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1778   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1779   if (zz != yy) {
1780     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1781   } else {
1782     zarray = yarray;
1783   }
1784 
1785   idx = a->j;
1786   v   = a->a;
1787   if (usecprow) {
1788     if (zz != yy) {
1789       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1790     }
1791     mbs  = a->compressedrow.nrows;
1792     ii   = a->compressedrow.i;
1793     ridx = a->compressedrow.rindex;
1794   } else {
1795     ii = a->i;
1796     y  = yarray;
1797     z  = zarray;
1798   }
1799 
1800   for (i=0; i<mbs; i++) {
1801     n = ii[1] - ii[0]; ii++;
1802     if (usecprow) {
1803       z = zarray + 3*ridx[i];
1804       y = yarray + 3*ridx[i];
1805     }
1806     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1807     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1808     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1809     for (j=0; j<n; j++) {
1810       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1811       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1812       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1813       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1814       v    += 9;
1815     }
1816     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1817     if (!usecprow) {
1818       z += 3; y += 3;
1819     }
1820   }
1821   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1822   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1823   if (zz != yy) {
1824     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1825   }
1826   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1827   PetscFunctionReturn(0);
1828 }
1829 
1830 #undef __FUNCT__
1831 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1832 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1833 {
1834   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1835   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1836   MatScalar      *v;
1837   PetscErrorCode ierr;
1838   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1839   PetscBool      usecprow=a->compressedrow.use;
1840 
1841   PetscFunctionBegin;
1842   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1843   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1844   if (zz != yy) {
1845     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1846   } else {
1847     zarray = yarray;
1848   }
1849 
1850   idx = a->j;
1851   v   = a->a;
1852   if (usecprow) {
1853     if (zz != yy) {
1854       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1855     }
1856     mbs  = a->compressedrow.nrows;
1857     ii   = a->compressedrow.i;
1858     ridx = a->compressedrow.rindex;
1859   } else {
1860     ii = a->i;
1861     y  = yarray;
1862     z  = zarray;
1863   }
1864 
1865   for (i=0; i<mbs; i++) {
1866     n = ii[1] - ii[0]; ii++;
1867     if (usecprow) {
1868       z = zarray + 4*ridx[i];
1869       y = yarray + 4*ridx[i];
1870     }
1871     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1872     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1873     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1874     for (j=0; j<n; j++) {
1875       xb    = x + 4*(*idx++);
1876       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1877       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1878       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1879       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1880       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1881       v    += 16;
1882     }
1883     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1884     if (!usecprow) {
1885       z += 4; y += 4;
1886     }
1887   }
1888   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1889   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1890   if (zz != yy) {
1891     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1892   }
1893   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1894   PetscFunctionReturn(0);
1895 }
1896 
1897 #undef __FUNCT__
1898 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1899 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1900 {
1901   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1902   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1903   PetscScalar    *yarray,*zarray;
1904   MatScalar      *v;
1905   PetscErrorCode ierr;
1906   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1907   PetscBool      usecprow=a->compressedrow.use;
1908 
1909   PetscFunctionBegin;
1910   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1911   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1912   if (zz != yy) {
1913     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1914   } else {
1915     zarray = yarray;
1916   }
1917 
1918   idx = a->j;
1919   v   = a->a;
1920   if (usecprow) {
1921     if (zz != yy) {
1922       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1923     }
1924     mbs  = a->compressedrow.nrows;
1925     ii   = a->compressedrow.i;
1926     ridx = a->compressedrow.rindex;
1927   } else {
1928     ii = a->i;
1929     y  = yarray;
1930     z  = zarray;
1931   }
1932 
1933   for (i=0; i<mbs; i++) {
1934     n = ii[1] - ii[0]; ii++;
1935     if (usecprow) {
1936       z = zarray + 5*ridx[i];
1937       y = yarray + 5*ridx[i];
1938     }
1939     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1940     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1941     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1942     for (j=0; j<n; j++) {
1943       xb    = x + 5*(*idx++);
1944       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1945       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1946       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1947       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1948       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1949       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1950       v    += 25;
1951     }
1952     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1953     if (!usecprow) {
1954       z += 5; y += 5;
1955     }
1956   }
1957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1958   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
1959   if (zz != yy) {
1960     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1961   }
1962   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1963   PetscFunctionReturn(0);
1964 }
1965 #undef __FUNCT__
1966 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1967 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1968 {
1969   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1970   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1971   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1972   MatScalar      *v;
1973   PetscErrorCode ierr;
1974   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1975   PetscBool      usecprow=a->compressedrow.use;
1976 
1977   PetscFunctionBegin;
1978   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1979   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
1980   if (zz != yy) {
1981     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1982   } else {
1983     zarray = yarray;
1984   }
1985 
1986   idx = a->j;
1987   v   = a->a;
1988   if (usecprow) {
1989     if (zz != yy) {
1990       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1991     }
1992     mbs  = a->compressedrow.nrows;
1993     ii   = a->compressedrow.i;
1994     ridx = a->compressedrow.rindex;
1995   } else {
1996     ii = a->i;
1997     y  = yarray;
1998     z  = zarray;
1999   }
2000 
2001   for (i=0; i<mbs; i++) {
2002     n = ii[1] - ii[0]; ii++;
2003     if (usecprow) {
2004       z = zarray + 6*ridx[i];
2005       y = yarray + 6*ridx[i];
2006     }
2007     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2008     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2009     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2010     for (j=0; j<n; j++) {
2011       xb    = x + 6*(*idx++);
2012       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2013       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2014       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2015       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2016       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2017       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2018       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2019       v    += 36;
2020     }
2021     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2022     if (!usecprow) {
2023       z += 6; y += 6;
2024     }
2025   }
2026   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2027   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
2028   if (zz != yy) {
2029     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2030   }
2031   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2032   PetscFunctionReturn(0);
2033 }
2034 
2035 #undef __FUNCT__
2036 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2037 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2038 {
2039   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2040   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2041   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2042   MatScalar      *v;
2043   PetscErrorCode ierr;
2044   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
2045   PetscBool      usecprow=a->compressedrow.use;
2046 
2047   PetscFunctionBegin;
2048   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2049   ierr = VecGetArray(yy,&yarray);CHKERRQ(ierr);
2050   if (zz != yy) {
2051     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2052   } else {
2053     zarray = yarray;
2054   }
2055 
2056   idx = a->j;
2057   v   = a->a;
2058   if (usecprow) {
2059     if (zz != yy) {
2060       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2061     }
2062     mbs  = a->compressedrow.nrows;
2063     ii   = a->compressedrow.i;
2064     ridx = a->compressedrow.rindex;
2065   } else {
2066     ii = a->i;
2067     y  = yarray;
2068     z  = zarray;
2069   }
2070 
2071   for (i=0; i<mbs; i++) {
2072     n = ii[1] - ii[0]; ii++;
2073     if (usecprow) {
2074       z = zarray + 7*ridx[i];
2075       y = yarray + 7*ridx[i];
2076     }
2077     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2078     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2079     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2080     for (j=0; j<n; j++) {
2081       xb    = x + 7*(*idx++);
2082       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2083       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2084       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2085       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2086       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2087       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2088       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2089       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2090       v    += 49;
2091     }
2092     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2093     if (!usecprow) {
2094       z += 7; y += 7;
2095     }
2096   }
2097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2098   ierr = VecRestoreArray(yy,&yarray);CHKERRQ(ierr);
2099   if (zz != yy) {
2100     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2101   }
2102   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2103   PetscFunctionReturn(0);
2104 }
2105 
2106 #undef __FUNCT__
2107 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2108 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2109 {
2110   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2111   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
2112   MatScalar      *v;
2113   PetscErrorCode ierr;
2114   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
2115   PetscInt       ncols,k,*ridx=NULL;
2116   PetscBool      usecprow=a->compressedrow.use;
2117 
2118   PetscFunctionBegin;
2119   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2120   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2121   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2122 
2123   idx = a->j;
2124   v   = a->a;
2125   if (usecprow) {
2126     mbs  = a->compressedrow.nrows;
2127     ii   = a->compressedrow.i;
2128     ridx = a->compressedrow.rindex;
2129   } else {
2130     mbs = a->mbs;
2131     ii  = a->i;
2132     z   = zarray;
2133   }
2134 
2135   if (!a->mult_work) {
2136     k    = PetscMax(A->rmap->n,A->cmap->n);
2137     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2138   }
2139   work = a->mult_work;
2140   for (i=0; i<mbs; i++) {
2141     n     = ii[1] - ii[0]; ii++;
2142     ncols = n*bs;
2143     workt = work;
2144     for (j=0; j<n; j++) {
2145       xb = x + bs*(*idx++);
2146       for (k=0; k<bs; k++) workt[k] = xb[k];
2147       workt += bs;
2148     }
2149     if (usecprow) z = zarray + bs*ridx[i];
2150     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2151     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2152     v += n*bs2;
2153     if (!usecprow) z += bs;
2154   }
2155   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2156   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2157   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 #undef __FUNCT__
2162 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2163 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2164 {
2165   PetscScalar    zero = 0.0;
2166   PetscErrorCode ierr;
2167 
2168   PetscFunctionBegin;
2169   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2170   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2171   PetscFunctionReturn(0);
2172 }
2173 
2174 #undef __FUNCT__
2175 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2176 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2177 {
2178   PetscScalar    zero = 0.0;
2179   PetscErrorCode ierr;
2180 
2181   PetscFunctionBegin;
2182   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2183   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2184   PetscFunctionReturn(0);
2185 }
2186 
2187 #undef __FUNCT__
2188 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2189 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2190 {
2191   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2192   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2193   MatScalar         *v;
2194   PetscErrorCode    ierr;
2195   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2196   Mat_CompressedRow cprow   = a->compressedrow;
2197   PetscBool         usecprow=cprow.use;
2198 
2199   PetscFunctionBegin;
2200   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2202   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2203 
2204   idx = a->j;
2205   v   = a->a;
2206   if (usecprow) {
2207     mbs  = cprow.nrows;
2208     ii   = cprow.i;
2209     ridx = cprow.rindex;
2210   } else {
2211     mbs=a->mbs;
2212     ii = a->i;
2213     xb = x;
2214   }
2215 
2216   switch (bs) {
2217   case 1:
2218     for (i=0; i<mbs; i++) {
2219       if (usecprow) xb = x + ridx[i];
2220       x1 = xb[0];
2221       ib = idx + ii[0];
2222       n  = ii[1] - ii[0]; ii++;
2223       for (j=0; j<n; j++) {
2224         rval     = ib[j];
2225         z[rval] += PetscConj(*v) * x1;
2226         v++;
2227       }
2228       if (!usecprow) xb++;
2229     }
2230     break;
2231   case 2:
2232     for (i=0; i<mbs; i++) {
2233       if (usecprow) xb = x + 2*ridx[i];
2234       x1 = xb[0]; x2 = xb[1];
2235       ib = idx + ii[0];
2236       n  = ii[1] - ii[0]; ii++;
2237       for (j=0; j<n; j++) {
2238         rval       = ib[j]*2;
2239         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2240         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2241         v         += 4;
2242       }
2243       if (!usecprow) xb += 2;
2244     }
2245     break;
2246   case 3:
2247     for (i=0; i<mbs; i++) {
2248       if (usecprow) xb = x + 3*ridx[i];
2249       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2250       ib = idx + ii[0];
2251       n  = ii[1] - ii[0]; ii++;
2252       for (j=0; j<n; j++) {
2253         rval       = ib[j]*3;
2254         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2255         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2256         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2257         v         += 9;
2258       }
2259       if (!usecprow) xb += 3;
2260     }
2261     break;
2262   case 4:
2263     for (i=0; i<mbs; i++) {
2264       if (usecprow) xb = x + 4*ridx[i];
2265       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2266       ib = idx + ii[0];
2267       n  = ii[1] - ii[0]; ii++;
2268       for (j=0; j<n; j++) {
2269         rval       = ib[j]*4;
2270         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2271         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2272         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2273         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2274         v         += 16;
2275       }
2276       if (!usecprow) xb += 4;
2277     }
2278     break;
2279   case 5:
2280     for (i=0; i<mbs; i++) {
2281       if (usecprow) xb = x + 5*ridx[i];
2282       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2283       x4 = xb[3]; x5 = xb[4];
2284       ib = idx + ii[0];
2285       n  = ii[1] - ii[0]; ii++;
2286       for (j=0; j<n; j++) {
2287         rval       = ib[j]*5;
2288         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2289         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2290         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2291         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2292         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2293         v         += 25;
2294       }
2295       if (!usecprow) xb += 5;
2296     }
2297     break;
2298   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2299     PetscInt    ncols,k;
2300     PetscScalar *work,*workt,*xtmp;
2301 
2302     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2303     if (!a->mult_work) {
2304       k    = PetscMax(A->rmap->n,A->cmap->n);
2305       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2306     }
2307     work = a->mult_work;
2308     xtmp = x;
2309     for (i=0; i<mbs; i++) {
2310       n     = ii[1] - ii[0]; ii++;
2311       ncols = n*bs;
2312       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2313       if (usecprow) xtmp = x + bs*ridx[i];
2314       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2315       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2316       v += n*bs2;
2317       if (!usecprow) xtmp += bs;
2318       workt = work;
2319       for (j=0; j<n; j++) {
2320         zb = z + bs*(*idx++);
2321         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2322         workt += bs;
2323       }
2324     }
2325     }
2326   }
2327   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2328   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2329   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2330   PetscFunctionReturn(0);
2331 }
2332 
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2335 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2336 {
2337   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2338   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2339   MatScalar         *v;
2340   PetscErrorCode    ierr;
2341   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2342   Mat_CompressedRow cprow   = a->compressedrow;
2343   PetscBool         usecprow=cprow.use;
2344 
2345   PetscFunctionBegin;
2346   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2347   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2348   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2349 
2350   idx = a->j;
2351   v   = a->a;
2352   if (usecprow) {
2353     mbs  = cprow.nrows;
2354     ii   = cprow.i;
2355     ridx = cprow.rindex;
2356   } else {
2357     mbs=a->mbs;
2358     ii = a->i;
2359     xb = x;
2360   }
2361 
2362   switch (bs) {
2363   case 1:
2364     for (i=0; i<mbs; i++) {
2365       if (usecprow) xb = x + ridx[i];
2366       x1 = xb[0];
2367       ib = idx + ii[0];
2368       n  = ii[1] - ii[0]; ii++;
2369       for (j=0; j<n; j++) {
2370         rval     = ib[j];
2371         z[rval] += *v * x1;
2372         v++;
2373       }
2374       if (!usecprow) xb++;
2375     }
2376     break;
2377   case 2:
2378     for (i=0; i<mbs; i++) {
2379       if (usecprow) xb = x + 2*ridx[i];
2380       x1 = xb[0]; x2 = xb[1];
2381       ib = idx + ii[0];
2382       n  = ii[1] - ii[0]; ii++;
2383       for (j=0; j<n; j++) {
2384         rval       = ib[j]*2;
2385         z[rval++] += v[0]*x1 + v[1]*x2;
2386         z[rval++] += v[2]*x1 + v[3]*x2;
2387         v         += 4;
2388       }
2389       if (!usecprow) xb += 2;
2390     }
2391     break;
2392   case 3:
2393     for (i=0; i<mbs; i++) {
2394       if (usecprow) xb = x + 3*ridx[i];
2395       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2396       ib = idx + ii[0];
2397       n  = ii[1] - ii[0]; ii++;
2398       for (j=0; j<n; j++) {
2399         rval       = ib[j]*3;
2400         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2401         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2402         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2403         v         += 9;
2404       }
2405       if (!usecprow) xb += 3;
2406     }
2407     break;
2408   case 4:
2409     for (i=0; i<mbs; i++) {
2410       if (usecprow) xb = x + 4*ridx[i];
2411       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2412       ib = idx + ii[0];
2413       n  = ii[1] - ii[0]; ii++;
2414       for (j=0; j<n; j++) {
2415         rval       = ib[j]*4;
2416         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2417         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2418         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2419         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2420         v         += 16;
2421       }
2422       if (!usecprow) xb += 4;
2423     }
2424     break;
2425   case 5:
2426     for (i=0; i<mbs; i++) {
2427       if (usecprow) xb = x + 5*ridx[i];
2428       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2429       x4 = xb[3]; x5 = xb[4];
2430       ib = idx + ii[0];
2431       n  = ii[1] - ii[0]; ii++;
2432       for (j=0; j<n; j++) {
2433         rval       = ib[j]*5;
2434         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2435         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2436         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2437         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2438         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2439         v         += 25;
2440       }
2441       if (!usecprow) xb += 5;
2442     }
2443     break;
2444   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2445     PetscInt    ncols,k;
2446     PetscScalar *work,*workt,*xtmp;
2447 
2448     if (!a->mult_work) {
2449       k    = PetscMax(A->rmap->n,A->cmap->n);
2450       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2451     }
2452     work = a->mult_work;
2453     xtmp = x;
2454     for (i=0; i<mbs; i++) {
2455       n     = ii[1] - ii[0]; ii++;
2456       ncols = n*bs;
2457       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2458       if (usecprow) xtmp = x + bs*ridx[i];
2459       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2460       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2461       v += n*bs2;
2462       if (!usecprow) xtmp += bs;
2463       workt = work;
2464       for (j=0; j<n; j++) {
2465         zb = z + bs*(*idx++);
2466         for (k=0; k<bs; k++) zb[k] += workt[k];
2467         workt += bs;
2468       }
2469     }
2470     }
2471   }
2472   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2473   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2474   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2475   PetscFunctionReturn(0);
2476 }
2477 
2478 #undef __FUNCT__
2479 #define __FUNCT__ "MatScale_SeqBAIJ"
2480 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2481 {
2482   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2483   PetscInt       totalnz = a->bs2*a->nz;
2484   PetscScalar    oalpha  = alpha;
2485   PetscErrorCode ierr;
2486   PetscBLASInt   one = 1,tnz;
2487 
2488   PetscFunctionBegin;
2489   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2490   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2491   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2492   PetscFunctionReturn(0);
2493 }
2494 
2495 #undef __FUNCT__
2496 #define __FUNCT__ "MatNorm_SeqBAIJ"
2497 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2498 {
2499   PetscErrorCode ierr;
2500   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2501   MatScalar      *v  = a->a;
2502   PetscReal      sum = 0.0;
2503   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2504 
2505   PetscFunctionBegin;
2506   if (type == NORM_FROBENIUS) {
2507     for (i=0; i< bs2*nz; i++) {
2508       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2509     }
2510     *norm = PetscSqrtReal(sum);
2511   } else if (type == NORM_1) { /* maximum column sum */
2512     PetscReal *tmp;
2513     PetscInt  *bcol = a->j;
2514     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2515     for (i=0; i<nz; i++) {
2516       for (j=0; j<bs; j++) {
2517         k1 = bs*(*bcol) + j; /* column index */
2518         for (k=0; k<bs; k++) {
2519           tmp[k1] += PetscAbsScalar(*v); v++;
2520         }
2521       }
2522       bcol++;
2523     }
2524     *norm = 0.0;
2525     for (j=0; j<A->cmap->n; j++) {
2526       if (tmp[j] > *norm) *norm = tmp[j];
2527     }
2528     ierr = PetscFree(tmp);CHKERRQ(ierr);
2529   } else if (type == NORM_INFINITY) { /* maximum row sum */
2530     *norm = 0.0;
2531     for (k=0; k<bs; k++) {
2532       for (j=0; j<a->mbs; j++) {
2533         v   = a->a + bs2*a->i[j] + k;
2534         sum = 0.0;
2535         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2536           for (k1=0; k1<bs; k1++) {
2537             sum += PetscAbsScalar(*v);
2538             v   += bs;
2539           }
2540         }
2541         if (sum > *norm) *norm = sum;
2542       }
2543     }
2544   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2545   PetscFunctionReturn(0);
2546 }
2547 
2548 
2549 #undef __FUNCT__
2550 #define __FUNCT__ "MatEqual_SeqBAIJ"
2551 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2552 {
2553   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2554   PetscErrorCode ierr;
2555 
2556   PetscFunctionBegin;
2557   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2558   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2559     *flg = PETSC_FALSE;
2560     PetscFunctionReturn(0);
2561   }
2562 
2563   /* if the a->i are the same */
2564   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2565   if (!*flg) PetscFunctionReturn(0);
2566 
2567   /* if a->j are the same */
2568   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2569   if (!*flg) PetscFunctionReturn(0);
2570 
2571   /* if a->a are the same */
2572   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2573   PetscFunctionReturn(0);
2574 
2575 }
2576 
2577 #undef __FUNCT__
2578 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2579 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2580 {
2581   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2582   PetscErrorCode ierr;
2583   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2584   PetscScalar    *x,zero = 0.0;
2585   MatScalar      *aa,*aa_j;
2586 
2587   PetscFunctionBegin;
2588   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2589   bs   = A->rmap->bs;
2590   aa   = a->a;
2591   ai   = a->i;
2592   aj   = a->j;
2593   ambs = a->mbs;
2594   bs2  = a->bs2;
2595 
2596   ierr = VecSet(v,zero);CHKERRQ(ierr);
2597   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2598   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2599   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2600   for (i=0; i<ambs; i++) {
2601     for (j=ai[i]; j<ai[i+1]; j++) {
2602       if (aj[j] == i) {
2603         row  = i*bs;
2604         aa_j = aa+j*bs2;
2605         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2606         break;
2607       }
2608     }
2609   }
2610   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2611   PetscFunctionReturn(0);
2612 }
2613 
2614 #undef __FUNCT__
2615 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2616 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2617 {
2618   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2619   const PetscScalar *l,*r,*li,*ri;
2620   PetscScalar       x;
2621   MatScalar         *aa, *v;
2622   PetscErrorCode    ierr;
2623   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2624   const PetscInt    *ai,*aj;
2625 
2626   PetscFunctionBegin;
2627   ai  = a->i;
2628   aj  = a->j;
2629   aa  = a->a;
2630   m   = A->rmap->n;
2631   n   = A->cmap->n;
2632   bs  = A->rmap->bs;
2633   mbs = a->mbs;
2634   bs2 = a->bs2;
2635   if (ll) {
2636     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2637     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2638     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2639     for (i=0; i<mbs; i++) { /* for each block row */
2640       M  = ai[i+1] - ai[i];
2641       li = l + i*bs;
2642       v  = aa + bs2*ai[i];
2643       for (j=0; j<M; j++) { /* for each block */
2644         for (k=0; k<bs2; k++) {
2645           (*v++) *= li[k%bs];
2646         }
2647       }
2648     }
2649     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2650     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2651   }
2652 
2653   if (rr) {
2654     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2655     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2656     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2657     for (i=0; i<mbs; i++) { /* for each block row */
2658       iai = ai[i];
2659       M   = ai[i+1] - iai;
2660       v   = aa + bs2*iai;
2661       for (j=0; j<M; j++) { /* for each block */
2662         ri = r + bs*aj[iai+j];
2663         for (k=0; k<bs; k++) {
2664           x = ri[k];
2665           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2666           v += bs;
2667         }
2668       }
2669     }
2670     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2671     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2672   }
2673   PetscFunctionReturn(0);
2674 }
2675 
2676 
2677 #undef __FUNCT__
2678 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2679 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2680 {
2681   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2682 
2683   PetscFunctionBegin;
2684   info->block_size   = a->bs2;
2685   info->nz_allocated = a->bs2*a->maxnz;
2686   info->nz_used      = a->bs2*a->nz;
2687   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2688   info->assemblies   = A->num_ass;
2689   info->mallocs      = A->info.mallocs;
2690   info->memory       = ((PetscObject)A)->mem;
2691   if (A->factortype) {
2692     info->fill_ratio_given  = A->info.fill_ratio_given;
2693     info->fill_ratio_needed = A->info.fill_ratio_needed;
2694     info->factor_mallocs    = A->info.factor_mallocs;
2695   } else {
2696     info->fill_ratio_given  = 0;
2697     info->fill_ratio_needed = 0;
2698     info->factor_mallocs    = 0;
2699   }
2700   PetscFunctionReturn(0);
2701 }
2702 
2703 
2704 #if defined(PETSC_THREADCOMM_ACTIVE)
2705 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2706 {
2707   PetscErrorCode ierr;
2708   PetscInt       *trstarts=A->rmap->trstarts;
2709   PetscInt       n,start,end;
2710   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2711 
2712   start = trstarts[thread_id]/A->rmap->bs;
2713   end   = trstarts[thread_id+1]/A->rmap->bs;
2714   n     = a->i[end] - a->i[start];
2715   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
2716   return 0;
2717 }
2718 
2719 #undef __FUNCT__
2720 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2721 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2722 {
2723   PetscErrorCode ierr;
2724 
2725 
2726   PetscFunctionBegin;
2727   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2728   PetscFunctionReturn(0);
2729 }
2730 #else
2731 #undef __FUNCT__
2732 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2733 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2734 {
2735   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2736   PetscErrorCode ierr;
2737 
2738   PetscFunctionBegin;
2739   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2740   PetscFunctionReturn(0);
2741 }
2742 #endif
2743