xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision ae5cfb4a3679d07b6904bd2c794eee8dc00ed47b)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag,sorted;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
93   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
97   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
98   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
99 
100   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
101   ssmap = smap;
102   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
103   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109     for (k=kstart; k<kend; k++) {
110       if (ssmap[aj[k]]) lens[i]++;
111     }
112   }
113   /* Create and fill new matrix */
114   if (scall == MAT_REUSE_MATRIX) {
115     c = (Mat_SeqBAIJ*)((*B)->data);
116 
117     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
119     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
121     C    = *B;
122   } else {
123     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
124     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
125     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
126     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
127   }
128   c = (Mat_SeqBAIJ*)(C->data);
129   for (i=0; i<nrows; i++) {
130     row      = irow[i];
131     kstart   = ai[row];
132     kend     = kstart + a->ilen[row];
133     mat_i    = c->i[i];
134     mat_j    = c->j + mat_i;
135     mat_a    = c->a + mat_i*bs2;
136     mat_ilen = c->ilen + i;
137     for (k=kstart; k<kend; k++) {
138       if ((tcol=ssmap[a->j[k]])) {
139         *mat_j++ = tcol - 1;
140         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
141         mat_a   += bs2;
142         (*mat_ilen)++;
143       }
144     }
145   }
146 
147   /* Free work space */
148   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
149   ierr = PetscFree(smap);CHKERRQ(ierr);
150   ierr = PetscFree(lens);CHKERRQ(ierr);
151   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
152   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
153 
154   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
155   *B   = C;
156   PetscFunctionReturn(0);
157 }
158 
159 #undef __FUNCT__
160 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
161 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
162 {
163   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
164   IS             is1,is2;
165   PetscErrorCode ierr;
166   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
167   const PetscInt *irow,*icol;
168 
169   PetscFunctionBegin;
170   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
171   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
172   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
173   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
174 
175   /* Verify if the indices corespond to each element in a block
176    and form the IS with compressed IS */
177   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
178   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
179   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180   count = 0;
181   for (i=0; i<a->mbs; i++) {
182     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183     if (vary[i]==bs) iary[count++] = i;
184   }
185   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
186 
187   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
188   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189   count = 0;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
192     if (vary[i]==bs) iary[count++] = i;
193   }
194   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
195   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
196   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
197   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
198 
199   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
200   ierr = ISDestroy(&is1);CHKERRQ(ierr);
201   ierr = ISDestroy(&is2);CHKERRQ(ierr);
202   PetscFunctionReturn(0);
203 }
204 
205 #undef __FUNCT__
206 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
207 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208 {
209   PetscErrorCode ierr;
210   PetscInt       i;
211 
212   PetscFunctionBegin;
213   if (scall == MAT_INITIAL_MATRIX) {
214     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
215   }
216 
217   for (i=0; i<n; i++) {
218     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
219   }
220   PetscFunctionReturn(0);
221 }
222 
223 
224 /* -------------------------------------------------------*/
225 /* Should check that shapes of vectors and matrices match */
226 /* -------------------------------------------------------*/
227 
228 #if defined(PETSC_THREADCOMM_ACTIVE)
229 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
230 {
231   PetscErrorCode    ierr;
232   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
233   PetscScalar       *z;
234   const PetscScalar *x;
235   const MatScalar   *aa;
236   PetscInt          *trstarts=A->rmap->trstarts;
237   PetscInt          n,start,end,i;
238   const PetscInt    *aj,*ai;
239   PetscScalar       sum;
240 
241   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
242   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
243   start = trstarts[thread_id];
244   end   = trstarts[thread_id+1];
245   ai    = a->i;
246   for (i=start; i<end; i++) {
247     n   = ai[i+1] - ai[i];
248     aj  = a->j + ai[i];
249     aa  = a->a + ai[i];
250     sum = 0.0;
251     PetscSparseDensePlusDot(sum,x,aa,aj,n);
252     z[i] = sum;
253   }
254   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
255   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
256   return 0;
257 }
258 
259 #undef __FUNCT__
260 #define __FUNCT__ "MatMult_SeqBAIJ_1"
261 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
262 {
263   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
264   PetscScalar       *z,sum;
265   const PetscScalar *x;
266   const MatScalar   *v;
267   PetscErrorCode    ierr;
268   PetscInt          mbs,i,n;
269   const PetscInt    *idx,*ii,*ridx=NULL;
270   PetscBool         usecprow=a->compressedrow.use;
271 
272   PetscFunctionBegin;
273 
274   if (usecprow) {
275     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
276     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
277     mbs  = a->compressedrow.nrows;
278     ii   = a->compressedrow.i;
279     ridx = a->compressedrow.rindex;
280     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
281     for (i=0; i<mbs; i++) {
282       n   = ii[i+1] - ii[i];
283       v   = a->a + ii[i];
284       idx = a->j + ii[i];
285       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
286       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
287       sum = 0.0;
288       PetscSparseDensePlusDot(sum,x,v,idx,n);
289       z[ridx[i]] = sum;
290     }
291     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
292     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
293   } else {
294     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
295   }
296   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 #else
300 #undef __FUNCT__
301 #define __FUNCT__ "MatMult_SeqBAIJ_1"
302 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
303 {
304   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
305   PetscScalar       *z,sum;
306   const PetscScalar *x;
307   const MatScalar   *v;
308   PetscErrorCode    ierr;
309   PetscInt          mbs,i,n;
310   const PetscInt    *idx,*ii,*ridx=NULL;
311   PetscBool         usecprow=a->compressedrow.use;
312 
313   PetscFunctionBegin;
314   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
315   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
316 
317   if (usecprow) {
318     mbs  = a->compressedrow.nrows;
319     ii   = a->compressedrow.i;
320     ridx = a->compressedrow.rindex;
321     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
322   } else {
323     mbs = a->mbs;
324     ii  = a->i;
325   }
326 
327   for (i=0; i<mbs; i++) {
328     n   = ii[1] - ii[0];
329     v   = a->a + ii[0];
330     idx = a->j + ii[0];
331     ii++;
332     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
333     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
334     sum = 0.0;
335     PetscSparseDensePlusDot(sum,x,v,idx,n);
336     if (usecprow) {
337       z[ridx[i]] = sum;
338     } else {
339       z[i]        = sum;
340     }
341   }
342   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
344   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 #endif
348 
349 #if defined(PETSC_THREADCOMM_ACTIVE)
350 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
351 {
352   PetscErrorCode    ierr;
353   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
354   PetscScalar       *z,x1,x2,sum1,sum2;
355   const PetscScalar *x,*xb;
356   const MatScalar   *aa;
357   PetscInt          *trstarts=A->rmap->trstarts;
358   PetscInt          n,start,end,i,j;
359   const PetscInt    *aj,*ai;
360 
361   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
362   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
363   start  = trstarts[thread_id] / 2;
364   end    = trstarts[thread_id+1] / 2;
365   ai     = a->i;
366   for (i=start; i<end; i++) {
367     n    = ai[i+1] - ai[i];
368     aj   = a->j + ai[i];
369     aa   = a->a + ai[i]*4;
370     sum1 = 0.0; sum2 = 0.0;
371     for (j=0; j<n; j++) {
372       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
373       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
374       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
375     }
376     z[2*i] = sum1; z[2*i+1] = sum2;
377   }
378   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
379   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
380   return 0;
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatMult_SeqBAIJ_2"
385 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
386 {
387   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
388   PetscScalar       *z,x1,x2,sum1,sum2;
389   const PetscScalar *x,*xb;
390   const MatScalar   *v;
391   PetscErrorCode    ierr;
392   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
393   PetscBool         usecprow=a->compressedrow.use;
394 
395   PetscFunctionBegin;
396 
397   if (usecprow) {
398     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
399     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
400     mbs  = a->compressedrow.nrows;
401     ii   = a->compressedrow.i;
402     ridx = a->compressedrow.rindex;
403     for (i=0; i<mbs; i++) {
404       n    = ii[i+1] - ii[i];
405       idx  = a->j + ii[i];
406       v    = a->a + ii[i]*4;
407       sum1 = 0.0; sum2 = 0.0;
408       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410       for (j=0; j<n; j++) {
411 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
412 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
413 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
414       }
415       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
416     }
417     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
418     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
419   } else {
420     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
421   }
422   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
423   PetscFunctionReturn(0);
424 }
425 #else
426 #undef __FUNCT__
427 #define __FUNCT__ "MatMult_SeqBAIJ_2"
428 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
429 {
430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
431   PetscScalar       *z = 0,sum1,sum2,*zarray;
432   const PetscScalar *x,*xb;
433   PetscScalar       x1,x2;
434   const MatScalar   *v;
435   PetscErrorCode    ierr;
436   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
437   PetscBool         usecprow=a->compressedrow.use;
438 
439   PetscFunctionBegin;
440   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
441   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
442 
443   idx = a->j;
444   v   = a->a;
445   if (usecprow) {
446     mbs  = a->compressedrow.nrows;
447     ii   = a->compressedrow.i;
448     ridx = a->compressedrow.rindex;
449   } else {
450     mbs = a->mbs;
451     ii  = a->i;
452     z   = zarray;
453   }
454 
455   for (i=0; i<mbs; i++) {
456     n           = ii[1] - ii[0]; ii++;
457     sum1        = 0.0; sum2 = 0.0;
458     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
459     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
460     for (j=0; j<n; j++) {
461       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
462       sum1 += v[0]*x1 + v[2]*x2;
463       sum2 += v[1]*x1 + v[3]*x2;
464       v    += 4;
465     }
466     if (usecprow) z = zarray + 2*ridx[i];
467     z[0] = sum1; z[1] = sum2;
468     if (!usecprow) z += 2;
469   }
470   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
471   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
472   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
473   PetscFunctionReturn(0);
474 }
475 #endif
476 
477 #if defined(PETSC_THREADCOMM_ACTIVE)
478 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
479 {
480   PetscErrorCode    ierr;
481   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
482   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
483   const PetscScalar *x,*xb;
484   const MatScalar   *aa;
485   PetscInt          *trstarts=A->rmap->trstarts;
486   PetscInt          n,start,end,i,j;
487   const PetscInt    *aj,*ai;
488 
489   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
490   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
491   start  = trstarts[thread_id] / 3;
492   end    = trstarts[thread_id+1] / 3;
493   ai     = a->i;
494   for (i=start; i<end; i++) {
495     n    = ai[i+1] - ai[i];
496     aj   = a->j + ai[i];
497     aa   = a->a + ai[i]*9;
498     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
499     for (j=0; j<n; j++) {
500       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
501       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
502       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
503       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
504     }
505     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
506   }
507   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
508   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509   return 0;
510 }
511 
512 #undef __FUNCT__
513 #define __FUNCT__ "MatMult_SeqBAIJ_3"
514 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
517   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
522   PetscBool         usecprow=a->compressedrow.use;
523 
524 
525 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
526 #pragma disjoint(*v,*z,*xb)
527 #endif
528 
529   PetscFunctionBegin;
530 
531   if (usecprow) {
532     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
533     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
534     mbs  = a->compressedrow.nrows;
535     ii   = a->compressedrow.i;
536     ridx = a->compressedrow.rindex;
537     for (i=0; i<mbs; i++) {
538       n    = ii[i+1] - ii[i];
539       idx  = a->j + ii[i];
540       v    = a->a + ii[i]*9;
541       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
542       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
543       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
544       for (j=0; j<n; j++) {
545 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
546 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
547 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
548 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
549       }
550       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
551     }
552     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
553     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
554   } else {
555     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
556   }
557   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
558   PetscFunctionReturn(0);
559 }
560 #else
561 #undef __FUNCT__
562 #define __FUNCT__ "MatMult_SeqBAIJ_3"
563 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
564 {
565   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
566   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
567   const PetscScalar *x,*xb;
568   const MatScalar   *v;
569   PetscErrorCode    ierr;
570   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
571   PetscBool         usecprow=a->compressedrow.use;
572 
573 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
574 #pragma disjoint(*v,*z,*xb)
575 #endif
576 
577   PetscFunctionBegin;
578   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
579   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
580 
581   idx = a->j;
582   v   = a->a;
583   if (usecprow) {
584     mbs  = a->compressedrow.nrows;
585     ii   = a->compressedrow.i;
586     ridx = a->compressedrow.rindex;
587   } else {
588     mbs = a->mbs;
589     ii  = a->i;
590     z   = zarray;
591   }
592 
593   for (i=0; i<mbs; i++) {
594     n           = ii[1] - ii[0]; ii++;
595     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
596     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
597     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
598     for (j=0; j<n; j++) {
599       xb = x + 3*(*idx++);
600       x1 = xb[0];
601       x2 = xb[1];
602       x3 = xb[2];
603 
604       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
605       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
606       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
607       v    += 9;
608     }
609     if (usecprow) z = zarray + 3*ridx[i];
610     z[0] = sum1; z[1] = sum2; z[2] = sum3;
611     if (!usecprow) z += 3;
612   }
613   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
614   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
615   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
616   PetscFunctionReturn(0);
617 }
618 #endif
619 
620 #if defined(PETSC_THREADCOMM_ACTIVE)
621 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
622 {
623   PetscErrorCode    ierr;
624   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
625   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
626   const PetscScalar *x,*xb;
627   const MatScalar   *aa;
628   PetscInt          *trstarts=A->rmap->trstarts;
629   PetscInt          n,start,end,i,j;
630   const PetscInt    *aj,*ai;
631 
632   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
633   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
634   start  = trstarts[thread_id] / 4;
635   end    = trstarts[thread_id+1] / 4;
636   ai     = a->i;
637   for (i=start; i<end; i++) {
638     n    = ai[i+1] - ai[i];
639     aj   = a->j + ai[i];
640     aa   = a->a + ai[i]*16;
641     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
642     for (j=0; j<n; j++) {
643       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
644       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
645       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
646       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
647       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
648     }
649     z[4*i]   = sum1; z[4*i+1] = sum2;
650     z[4*i+2] = sum3; z[4*i+3] = sum4;
651   }
652   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
653   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
654   return 0;
655 }
656 
657 #undef __FUNCT__
658 #define __FUNCT__ "MatMult_SeqBAIJ_4"
659 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
660 {
661   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
662   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
663   const PetscScalar *x,*xb;
664   const MatScalar   *v;
665   PetscErrorCode    ierr;
666   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
667   PetscBool         usecprow=a->compressedrow.use;
668 
669   PetscFunctionBegin;
670 
671   if (usecprow) {
672     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
673     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
674     mbs  = a->compressedrow.nrows;
675     ii   = a->compressedrow.i;
676     ridx = a->compressedrow.rindex;
677     for (i=0; i<mbs; i++) {
678       n = ii[i+1] - ii[1];
679       idx  = a->j + ii[i];
680       v    = a->a + ii[i]*16;
681       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
682       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
683       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
684       for (j=0; j<n; j++) {
685 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
686 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
687 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
688 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
689 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
690       }
691       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
692       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
693     }
694     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
695     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
696   } else {
697     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
698   }
699   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
700   PetscFunctionReturn(0);
701 }
702 #else
703 #undef __FUNCT__
704 #define __FUNCT__ "MatMult_SeqBAIJ_4"
705 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
706 {
707   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
708   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
709   const PetscScalar *x,*xb;
710   const MatScalar   *v;
711   PetscErrorCode    ierr;
712   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
713   PetscBool         usecprow=a->compressedrow.use;
714 
715   PetscFunctionBegin;
716   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
717   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
718 
719   idx = a->j;
720   v   = a->a;
721   if (usecprow) {
722     mbs  = a->compressedrow.nrows;
723     ii   = a->compressedrow.i;
724     ridx = a->compressedrow.rindex;
725   } else {
726     mbs = a->mbs;
727     ii  = a->i;
728     z   = zarray;
729   }
730 
731   for (i=0; i<mbs; i++) {
732     n = ii[1] - ii[0];
733     ii++;
734     sum1 = 0.0;
735     sum2 = 0.0;
736     sum3 = 0.0;
737     sum4 = 0.0;
738 
739     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
740     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
741     for (j=0; j<n; j++) {
742       xb    = x + 4*(*idx++);
743       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
744       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
745       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
746       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
747       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
748       v    += 16;
749     }
750     if (usecprow) z = zarray + 4*ridx[i];
751     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
752     if (!usecprow) z += 4;
753   }
754   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
755   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
756   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
757   PetscFunctionReturn(0);
758 }
759 #endif
760 
761 #if defined(PETSC_THREADCOMM_ACTIVE)
762 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
763 {
764   PetscErrorCode    ierr;
765   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
766   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
767   const PetscScalar *x,*xb;
768   const MatScalar   *aa;
769   PetscInt          *trstarts=A->rmap->trstarts;
770   PetscInt          n,start,end,i,j;
771   const PetscInt    *aj,*ai;
772 
773   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
774   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
775   start  = trstarts[thread_id] / 5;
776   end    = trstarts[thread_id+1] / 5;
777   ai     = a->i;
778   for (i=start; i<end; i++) {
779     n    = ai[i+1] - ai[i];
780     aj   = a->j + ai[i];
781     aa   = a->a + ai[i]*25;
782     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
783     for (j=0; j<n; j++) {
784       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
785       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
786       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
787       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
788       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
789       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
790     }
791     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
792     z[5*i+3] = sum4; z[5*i+4] = sum5;
793   }
794   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
795   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
796   return 0;
797 }
798 
799 #undef __FUNCT__
800 #define __FUNCT__ "MatMult_SeqBAIJ_5"
801 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
802 {
803   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
804   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
805   const PetscScalar *xb,*x;
806   const MatScalar   *v;
807   PetscErrorCode    ierr;
808   const PetscInt    *idx,*ii,*ridx=NULL;
809   PetscInt          mbs,i,j,n;
810   PetscBool         usecprow=a->compressedrow.use;
811 
812   PetscFunctionBegin;
813 
814   if (usecprow) {
815     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
816     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
817     mbs  = a->compressedrow.nrows;
818     ii   = a->compressedrow.i;
819     ridx = a->compressedrow.rindex;
820     for (i=0; i<mbs; i++) {
821       n    = ii[i+1] - ii[i];
822       idx  = a->j + ii[i];
823       v    = a->a + ii[i]*25;
824       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
825       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
826       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
827       for (j=0; j<n; j++) {
828 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
829 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
830 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
831 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
832 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
833 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
834       }
835       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
836       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
837     }
838     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
839     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
840   } else {
841     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
842   }
843   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
844   PetscFunctionReturn(0);
845 }
846 #else
847 #undef __FUNCT__
848 #define __FUNCT__ "MatMult_SeqBAIJ_5"
849 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
850 {
851   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
852   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
853   const PetscScalar *xb,*x;
854   const MatScalar   *v;
855   PetscErrorCode    ierr;
856   const PetscInt    *idx,*ii,*ridx=NULL;
857   PetscInt          mbs,i,j,n;
858   PetscBool         usecprow=a->compressedrow.use;
859 
860   PetscFunctionBegin;
861   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
862   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
863 
864   idx = a->j;
865   v   = a->a;
866   if (usecprow) {
867     mbs  = a->compressedrow.nrows;
868     ii   = a->compressedrow.i;
869     ridx = a->compressedrow.rindex;
870   } else {
871     mbs = a->mbs;
872     ii  = a->i;
873     z   = zarray;
874   }
875 
876   for (i=0; i<mbs; i++) {
877     n           = ii[1] - ii[0]; ii++;
878     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
879     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
880     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
881     for (j=0; j<n; j++) {
882       xb    = x + 5*(*idx++);
883       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
884       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
885       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
886       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
887       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
888       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
889       v    += 25;
890     }
891     if (usecprow) z = zarray + 5*ridx[i];
892     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
893     if (!usecprow) z += 5;
894   }
895   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
896   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
897   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
898   PetscFunctionReturn(0);
899 }
900 #endif
901 
902 
903 #if defined(PETSC_THREADCOMM_ACTIVE)
904 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
905 {
906   PetscErrorCode    ierr;
907   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
908   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
909   const PetscScalar *x,*xb;
910   const MatScalar   *aa;
911   PetscInt          *trstarts=A->rmap->trstarts;
912   PetscInt          n,start,end,i,j;
913   const PetscInt    *aj,*ai;
914 
915   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
916   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
917   start  = trstarts[thread_id] / 6;
918   end    = trstarts[thread_id+1] / 6;
919   ai     = a->i;
920   for (i=start; i<end; i++) {
921     n    = ai[i+1] - ai[i];
922     aj   = a->j + ai[i];
923     aa   = a->a + ai[i]*36;
924     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
925     for (j=0; j<n; j++) {
926       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
927       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
928       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
929       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
930       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
931       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
932       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
933     }
934     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
935     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
936   }
937   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
938   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
939   return 0;
940 }
941 
942 #undef __FUNCT__
943 #define __FUNCT__ "MatMult_SeqBAIJ_6"
944 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
945 {
946   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
947   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
948   const PetscScalar *x,*xb;
949   const MatScalar   *v;
950   PetscErrorCode    ierr;
951   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
952   PetscBool         usecprow=a->compressedrow.use;
953 
954   PetscFunctionBegin;
955 
956   if (usecprow) {
957     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
958     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
959     mbs  = a->compressedrow.nrows;
960     ii   = a->compressedrow.i;
961     ridx = a->compressedrow.rindex;
962     for (i=0; i<mbs; i++) {
963       n  = ii[i+1] - ii[i];
964       idx  = a->j + ii[i];
965       v    = a->a + ii[i]*36;
966       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
967       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
968       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
969       for (j=0; j<n; j++) {
970 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
971 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
972 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
973 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
974 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
975 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
976 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
977       }
978       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
979       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
980     }
981     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
982     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
983   } else {
984     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
985   }
986   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
987   PetscFunctionReturn(0);
988 }
989 #else
990 #undef __FUNCT__
991 #define __FUNCT__ "MatMult_SeqBAIJ_6"
992 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
993 {
994   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
995   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
996   const PetscScalar *x,*xb;
997   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
998   const MatScalar   *v;
999   PetscErrorCode    ierr;
1000   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1001   PetscBool         usecprow=a->compressedrow.use;
1002 
1003   PetscFunctionBegin;
1004   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1005   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1006 
1007   idx = a->j;
1008   v   = a->a;
1009   if (usecprow) {
1010     mbs  = a->compressedrow.nrows;
1011     ii   = a->compressedrow.i;
1012     ridx = a->compressedrow.rindex;
1013   } else {
1014     mbs = a->mbs;
1015     ii  = a->i;
1016     z   = zarray;
1017   }
1018 
1019   for (i=0; i<mbs; i++) {
1020     n  = ii[1] - ii[0];
1021     ii++;
1022     sum1 = 0.0;
1023     sum2 = 0.0;
1024     sum3 = 0.0;
1025     sum4 = 0.0;
1026     sum5 = 0.0;
1027     sum6 = 0.0;
1028 
1029     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1030     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1031     for (j=0; j<n; j++) {
1032       xb    = x + 6*(*idx++);
1033       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1034       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1035       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1036       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1037       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1038       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1039       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1040       v    += 36;
1041     }
1042     if (usecprow) z = zarray + 6*ridx[i];
1043     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1044     if (!usecprow) z += 6;
1045   }
1046 
1047   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1048   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1049   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 #endif
1053 
1054 #if defined(PETSC_THREADCOMM_ACTIVE)
1055 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1056 {
1057   PetscErrorCode    ierr;
1058   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1059   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1060   const PetscScalar *x,*xb;
1061   const MatScalar   *aa;
1062   PetscInt          *trstarts=A->rmap->trstarts;
1063   PetscInt          n,start,end,i,j;
1064   const PetscInt    *aj,*ai;
1065 
1066   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1067   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1068   start  = trstarts[thread_id] / 7;
1069   end    = trstarts[thread_id+1] / 7;
1070   ai     = a->i;
1071   for (i=start; i<end; i++) {
1072     n    = ai[i+1] - ai[i];
1073     aj   = a->j + ai[i];
1074     aa   = a->a + ai[i]*49;
1075     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1076     for (j=0; j<n; j++) {
1077       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1078       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1079       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1080       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1081       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1082       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1083       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1084       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1085     }
1086     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1087     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1088   }
1089   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1091   return 0;
1092 }
1093 
1094 #undef __FUNCT__
1095 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1096 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1097 {
1098   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1099   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1100   const PetscScalar *x,*xb;
1101   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1102   const MatScalar   *v;
1103   PetscErrorCode    ierr;
1104   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1105   PetscBool         usecprow=a->compressedrow.use;
1106 
1107   PetscFunctionBegin;
1108 
1109   if (usecprow) {
1110     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1111     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1112     mbs  = a->compressedrow.nrows;
1113     ii   = a->compressedrow.i;
1114     ridx = a->compressedrow.rindex;
1115     for (i=0; i<mbs; i++) {
1116       n    = ii[i+1] - ii[i];
1117       idx  = a->j + ii[i];
1118       v    = a->a + ii[i]*49;
1119       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1120       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1121       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1122       for (j=0; j<n; j++) {
1123 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1124 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1125 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1126 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1127 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1128 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1129 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1130 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1131       }
1132       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1133       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1134     }
1135     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1136     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1137   } else {
1138     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1139   }
1140   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1141   PetscFunctionReturn(0);
1142 }
1143 #else
1144 #undef __FUNCT__
1145 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1146 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1147 {
1148   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1149   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1150   const PetscScalar *x,*xb;
1151   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1152   const MatScalar   *v;
1153   PetscErrorCode    ierr;
1154   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1155   PetscBool         usecprow=a->compressedrow.use;
1156 
1157   PetscFunctionBegin;
1158   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1159   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1160 
1161   idx = a->j;
1162   v   = a->a;
1163   if (usecprow) {
1164     mbs  = a->compressedrow.nrows;
1165     ii   = a->compressedrow.i;
1166     ridx = a->compressedrow.rindex;
1167   } else {
1168     mbs = a->mbs;
1169     ii  = a->i;
1170     z   = zarray;
1171   }
1172 
1173   for (i=0; i<mbs; i++) {
1174     n  = ii[1] - ii[0];
1175     ii++;
1176     sum1 = 0.0;
1177     sum2 = 0.0;
1178     sum3 = 0.0;
1179     sum4 = 0.0;
1180     sum5 = 0.0;
1181     sum6 = 0.0;
1182     sum7 = 0.0;
1183 
1184     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1185     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1186     for (j=0; j<n; j++) {
1187       xb    = x + 7*(*idx++);
1188       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1189       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196       v    += 49;
1197     }
1198     if (usecprow) z = zarray + 7*ridx[i];
1199     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1200     if (!usecprow) z += 7;
1201   }
1202 
1203   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1204   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1205   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1206   PetscFunctionReturn(0);
1207 }
1208 #endif
1209 
1210 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1211 /* Default MatMult for block size 15 */
1212 
1213 #undef __FUNCT__
1214 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1215 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1216 {
1217   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1218   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1219   const PetscScalar *x,*xb;
1220   PetscScalar       *zarray,xv;
1221   const MatScalar   *v;
1222   PetscErrorCode    ierr;
1223   const PetscInt    *ii,*ij=a->j,*idx;
1224   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1225   PetscBool         usecprow=a->compressedrow.use;
1226 
1227   PetscFunctionBegin;
1228   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1229   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1230 
1231   v = a->a;
1232   if (usecprow) {
1233     mbs  = a->compressedrow.nrows;
1234     ii   = a->compressedrow.i;
1235     ridx = a->compressedrow.rindex;
1236   } else {
1237     mbs = a->mbs;
1238     ii  = a->i;
1239     z   = zarray;
1240   }
1241 
1242   for (i=0; i<mbs; i++) {
1243     n    = ii[i+1] - ii[i];
1244     idx  = ij + ii[i];
1245     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1246     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1247 
1248     for (j=0; j<n; j++) {
1249       xb = x + 15*(idx[j]);
1250 
1251       for (k=0; k<15; k++) {
1252         xv     =  xb[k];
1253         sum1  += v[0]*xv;
1254         sum2  += v[1]*xv;
1255         sum3  += v[2]*xv;
1256         sum4  += v[3]*xv;
1257         sum5  += v[4]*xv;
1258         sum6  += v[5]*xv;
1259         sum7  += v[6]*xv;
1260         sum8  += v[7]*xv;
1261         sum9  += v[8]*xv;
1262         sum10 += v[9]*xv;
1263         sum11 += v[10]*xv;
1264         sum12 += v[11]*xv;
1265         sum13 += v[12]*xv;
1266         sum14 += v[13]*xv;
1267         sum15 += v[14]*xv;
1268         v     += 15;
1269       }
1270     }
1271     if (usecprow) z = zarray + 15*ridx[i];
1272     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1273     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1274 
1275     if (!usecprow) z += 15;
1276   }
1277 
1278   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1279   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1280   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1281   PetscFunctionReturn(0);
1282 }
1283 
1284 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1285 #undef __FUNCT__
1286 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1287 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1288 {
1289   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1290   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1291   const PetscScalar *x,*xb;
1292   PetscScalar       x1,x2,x3,x4,*zarray;
1293   const MatScalar   *v;
1294   PetscErrorCode    ierr;
1295   const PetscInt    *ii,*ij=a->j,*idx;
1296   PetscInt          mbs,i,j,n,*ridx=NULL;
1297   PetscBool         usecprow=a->compressedrow.use;
1298 
1299   PetscFunctionBegin;
1300   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1301   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1302 
1303   v = a->a;
1304   if (usecprow) {
1305     mbs  = a->compressedrow.nrows;
1306     ii   = a->compressedrow.i;
1307     ridx = a->compressedrow.rindex;
1308   } else {
1309     mbs = a->mbs;
1310     ii  = a->i;
1311     z   = zarray;
1312   }
1313 
1314   for (i=0; i<mbs; i++) {
1315     n    = ii[i+1] - ii[i];
1316     idx  = ij + ii[i];
1317     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1318     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1319 
1320     for (j=0; j<n; j++) {
1321       xb = x + 15*(idx[j]);
1322       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1323 
1324       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1325       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1326       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1327       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1328       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1329       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1330       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1331       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1332       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1333       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1334       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1335       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1336       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1337       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1338       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1339 
1340       v += 60;
1341 
1342       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1343 
1344       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1345       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1346       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1347       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1348       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1349       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1350       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1351       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1352       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1353       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1354       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1355       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1356       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1357       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1358       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1359       v     += 60;
1360 
1361       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1362       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1363       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1364       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1365       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1366       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1367       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1368       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1369       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1370       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1371       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1372       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1373       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1374       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1375       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1376       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1377       v     += 60;
1378 
1379       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1380       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1381       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1382       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1383       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1384       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1385       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1386       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1387       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1388       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1389       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1390       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1391       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1392       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1393       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1394       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1395       v     += 45;
1396     }
1397     if (usecprow) z = zarray + 15*ridx[i];
1398     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1399     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1400 
1401     if (!usecprow) z += 15;
1402   }
1403 
1404   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1405   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1406   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1407   PetscFunctionReturn(0);
1408 }
1409 
1410 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1413 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1414 {
1415   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1416   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1417   const PetscScalar *x,*xb;
1418   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1419   const MatScalar   *v;
1420   PetscErrorCode    ierr;
1421   const PetscInt    *ii,*ij=a->j,*idx;
1422   PetscInt          mbs,i,j,n,*ridx=NULL;
1423   PetscBool         usecprow=a->compressedrow.use;
1424 
1425   PetscFunctionBegin;
1426   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1427   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1428 
1429   v = a->a;
1430   if (usecprow) {
1431     mbs  = a->compressedrow.nrows;
1432     ii   = a->compressedrow.i;
1433     ridx = a->compressedrow.rindex;
1434   } else {
1435     mbs = a->mbs;
1436     ii  = a->i;
1437     z   = zarray;
1438   }
1439 
1440   for (i=0; i<mbs; i++) {
1441     n    = ii[i+1] - ii[i];
1442     idx  = ij + ii[i];
1443     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1444     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1445 
1446     for (j=0; j<n; j++) {
1447       xb = x + 15*(idx[j]);
1448       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1449       x8 = xb[7];
1450 
1451       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1452       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1453       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1454       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1455       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1456       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1457       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1458       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1459       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1460       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1461       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1462       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1463       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1464       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1465       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1466       v     += 120;
1467 
1468       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1469 
1470       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1471       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1472       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1473       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1474       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1475       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1476       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1477       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1478       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1479       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1480       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1481       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1482       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1483       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1484       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1485       v     += 105;
1486     }
1487     if (usecprow) z = zarray + 15*ridx[i];
1488     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1489     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1490 
1491     if (!usecprow) z += 15;
1492   }
1493 
1494   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1495   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1496   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1497   PetscFunctionReturn(0);
1498 }
1499 
1500 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1501 
1502 #undef __FUNCT__
1503 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1504 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1505 {
1506   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1507   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1508   const PetscScalar *x,*xb;
1509   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1510   const MatScalar   *v;
1511   PetscErrorCode    ierr;
1512   const PetscInt    *ii,*ij=a->j,*idx;
1513   PetscInt          mbs,i,j,n,*ridx=NULL;
1514   PetscBool         usecprow=a->compressedrow.use;
1515 
1516   PetscFunctionBegin;
1517   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1518   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1519 
1520   v = a->a;
1521   if (usecprow) {
1522     mbs  = a->compressedrow.nrows;
1523     ii   = a->compressedrow.i;
1524     ridx = a->compressedrow.rindex;
1525   } else {
1526     mbs = a->mbs;
1527     ii  = a->i;
1528     z   = zarray;
1529   }
1530 
1531   for (i=0; i<mbs; i++) {
1532     n    = ii[i+1] - ii[i];
1533     idx  = ij + ii[i];
1534     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1535     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1536 
1537     for (j=0; j<n; j++) {
1538       xb = x + 15*(idx[j]);
1539       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1540       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1541 
1542       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1543       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1544       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1545       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1546       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1547       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1548       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1549       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1550       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1551       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1552       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1553       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1554       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1555       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1556       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1557       v     += 225;
1558     }
1559     if (usecprow) z = zarray + 15*ridx[i];
1560     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1561     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1562 
1563     if (!usecprow) z += 15;
1564   }
1565 
1566   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1567   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1568   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 
1573 /*
1574     This will not work with MatScalar == float because it calls the BLAS
1575 */
1576 #undef __FUNCT__
1577 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1578 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1579 {
1580   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1581   PetscScalar       *z = 0,*work,*workt,*zarray;
1582   const PetscScalar *x,*xb;
1583   const MatScalar   *v;
1584   PetscErrorCode    ierr;
1585   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1586   const PetscInt    *idx,*ii,*ridx=NULL;
1587   PetscInt          ncols,k;
1588   PetscBool         usecprow=a->compressedrow.use;
1589 
1590   PetscFunctionBegin;
1591   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1592   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1593 
1594   idx = a->j;
1595   v   = a->a;
1596   if (usecprow) {
1597     mbs  = a->compressedrow.nrows;
1598     ii   = a->compressedrow.i;
1599     ridx = a->compressedrow.rindex;
1600   } else {
1601     mbs = a->mbs;
1602     ii  = a->i;
1603     z   = zarray;
1604   }
1605 
1606   if (!a->mult_work) {
1607     k    = PetscMax(A->rmap->n,A->cmap->n);
1608     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1609   }
1610   work = a->mult_work;
1611   for (i=0; i<mbs; i++) {
1612     n           = ii[1] - ii[0]; ii++;
1613     ncols       = n*bs;
1614     workt       = work;
1615     for (j=0; j<n; j++) {
1616       xb = x + bs*(*idx++);
1617       for (k=0; k<bs; k++) workt[k] = xb[k];
1618       workt += bs;
1619     }
1620     if (usecprow) z = zarray + bs*ridx[i];
1621     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1622     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1623     v += n*bs2;
1624     if (!usecprow) z += bs;
1625   }
1626   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1627   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1628   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 #undef __FUNCT__
1633 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1634 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1635 {
1636   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1637   const PetscScalar *x;
1638   PetscScalar       *y,*z,sum;
1639   const MatScalar   *v;
1640   PetscErrorCode    ierr;
1641   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1642   const PetscInt    *idx,*ii;
1643   PetscBool         usecprow=a->compressedrow.use;
1644 
1645   PetscFunctionBegin;
1646   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1647   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1648 
1649   idx = a->j;
1650   v   = a->a;
1651   if (usecprow) {
1652     if (zz != yy) {
1653       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1654     }
1655     mbs  = a->compressedrow.nrows;
1656     ii   = a->compressedrow.i;
1657     ridx = a->compressedrow.rindex;
1658   } else {
1659     ii = a->i;
1660   }
1661 
1662   for (i=0; i<mbs; i++) {
1663     n = ii[1] - ii[0];
1664     ii++;
1665     if (!usecprow) {
1666       sum         = y[i];
1667     } else {
1668       sum = y[ridx[i]];
1669     }
1670     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1671     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1672     PetscSparseDensePlusDot(sum,x,v,idx,n);
1673     v   += n;
1674     idx += n;
1675     if (usecprow) {
1676       z[ridx[i]] = sum;
1677     } else {
1678       z[i] = sum;
1679     }
1680   }
1681   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1682   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1683   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1684   PetscFunctionReturn(0);
1685 }
1686 
1687 #undef __FUNCT__
1688 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1689 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1690 {
1691   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1692   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1693   const PetscScalar *x,*xb;
1694   PetscScalar       x1,x2,*yarray,*zarray;
1695   const MatScalar   *v;
1696   PetscErrorCode    ierr;
1697   PetscInt          mbs = a->mbs,i,n,j;
1698   const PetscInt    *idx,*ii,*ridx = NULL;
1699   PetscBool         usecprow = a->compressedrow.use;
1700 
1701   PetscFunctionBegin;
1702   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1703   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1704 
1705   idx = a->j;
1706   v   = a->a;
1707   if (usecprow) {
1708     if (zz != yy) {
1709       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1710     }
1711     mbs  = a->compressedrow.nrows;
1712     ii   = a->compressedrow.i;
1713     ridx = a->compressedrow.rindex;
1714     if (zz != yy) {
1715       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1716     }
1717   } else {
1718     ii = a->i;
1719     y  = yarray;
1720     z  = zarray;
1721   }
1722 
1723   for (i=0; i<mbs; i++) {
1724     n = ii[1] - ii[0]; ii++;
1725     if (usecprow) {
1726       z = zarray + 2*ridx[i];
1727       y = yarray + 2*ridx[i];
1728     }
1729     sum1 = y[0]; sum2 = y[1];
1730     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1731     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1732     for (j=0; j<n; j++) {
1733       xb = x + 2*(*idx++);
1734       x1 = xb[0];
1735       x2 = xb[1];
1736 
1737       sum1 += v[0]*x1 + v[2]*x2;
1738       sum2 += v[1]*x1 + v[3]*x2;
1739       v    += 4;
1740     }
1741     z[0] = sum1; z[1] = sum2;
1742     if (!usecprow) {
1743       z += 2; y += 2;
1744     }
1745   }
1746   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1747   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1748   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 #undef __FUNCT__
1753 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1754 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1755 {
1756   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1757   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1758   const PetscScalar *x,*xb;
1759   const MatScalar   *v;
1760   PetscErrorCode    ierr;
1761   PetscInt          mbs = a->mbs,i,j,n;
1762   const PetscInt    *idx,*ii,*ridx = NULL;
1763   PetscBool         usecprow = a->compressedrow.use;
1764 
1765   PetscFunctionBegin;
1766   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1767   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1768 
1769   idx = a->j;
1770   v   = a->a;
1771   if (usecprow) {
1772     if (zz != yy) {
1773       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1774     }
1775     mbs  = a->compressedrow.nrows;
1776     ii   = a->compressedrow.i;
1777     ridx = a->compressedrow.rindex;
1778   } else {
1779     ii = a->i;
1780     y  = yarray;
1781     z  = zarray;
1782   }
1783 
1784   for (i=0; i<mbs; i++) {
1785     n = ii[1] - ii[0]; ii++;
1786     if (usecprow) {
1787       z = zarray + 3*ridx[i];
1788       y = yarray + 3*ridx[i];
1789     }
1790     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1791     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1792     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1793     for (j=0; j<n; j++) {
1794       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1795       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1796       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1797       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1798       v    += 9;
1799     }
1800     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1801     if (!usecprow) {
1802       z += 3; y += 3;
1803     }
1804   }
1805   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1806   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1807   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1808   PetscFunctionReturn(0);
1809 }
1810 
1811 #undef __FUNCT__
1812 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1813 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1814 {
1815   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1816   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1817   const PetscScalar *x,*xb;
1818   const MatScalar   *v;
1819   PetscErrorCode    ierr;
1820   PetscInt          mbs = a->mbs,i,j,n;
1821   const PetscInt    *idx,*ii,*ridx=NULL;
1822   PetscBool         usecprow=a->compressedrow.use;
1823 
1824   PetscFunctionBegin;
1825   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1826   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1827 
1828   idx = a->j;
1829   v   = a->a;
1830   if (usecprow) {
1831     if (zz != yy) {
1832       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1833     }
1834     mbs  = a->compressedrow.nrows;
1835     ii   = a->compressedrow.i;
1836     ridx = a->compressedrow.rindex;
1837   } else {
1838     ii = a->i;
1839     y  = yarray;
1840     z  = zarray;
1841   }
1842 
1843   for (i=0; i<mbs; i++) {
1844     n = ii[1] - ii[0]; ii++;
1845     if (usecprow) {
1846       z = zarray + 4*ridx[i];
1847       y = yarray + 4*ridx[i];
1848     }
1849     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1850     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1851     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1852     for (j=0; j<n; j++) {
1853       xb    = x + 4*(*idx++);
1854       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1855       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1856       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1857       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1858       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1859       v    += 16;
1860     }
1861     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1862     if (!usecprow) {
1863       z += 4; y += 4;
1864     }
1865   }
1866   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1867   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1868   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1869   PetscFunctionReturn(0);
1870 }
1871 
1872 #undef __FUNCT__
1873 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1874 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1875 {
1876   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1877   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1878   const PetscScalar *x,*xb;
1879   PetscScalar       *yarray,*zarray;
1880   const MatScalar   *v;
1881   PetscErrorCode    ierr;
1882   PetscInt          mbs = a->mbs,i,j,n;
1883   const PetscInt    *idx,*ii,*ridx = NULL;
1884   PetscBool         usecprow=a->compressedrow.use;
1885 
1886   PetscFunctionBegin;
1887   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1888   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1889 
1890   idx = a->j;
1891   v   = a->a;
1892   if (usecprow) {
1893     if (zz != yy) {
1894       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1895     }
1896     mbs  = a->compressedrow.nrows;
1897     ii   = a->compressedrow.i;
1898     ridx = a->compressedrow.rindex;
1899   } else {
1900     ii = a->i;
1901     y  = yarray;
1902     z  = zarray;
1903   }
1904 
1905   for (i=0; i<mbs; i++) {
1906     n = ii[1] - ii[0]; ii++;
1907     if (usecprow) {
1908       z = zarray + 5*ridx[i];
1909       y = yarray + 5*ridx[i];
1910     }
1911     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1912     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1913     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1914     for (j=0; j<n; j++) {
1915       xb    = x + 5*(*idx++);
1916       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1917       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1918       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1919       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1920       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1921       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1922       v    += 25;
1923     }
1924     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1925     if (!usecprow) {
1926       z += 5; y += 5;
1927     }
1928   }
1929   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1930   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1931   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1932   PetscFunctionReturn(0);
1933 }
1934 #undef __FUNCT__
1935 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1936 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1937 {
1938   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1939   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1940   const PetscScalar *x,*xb;
1941   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1942   const MatScalar   *v;
1943   PetscErrorCode    ierr;
1944   PetscInt          mbs = a->mbs,i,j,n;
1945   const PetscInt    *idx,*ii,*ridx=NULL;
1946   PetscBool         usecprow=a->compressedrow.use;
1947 
1948   PetscFunctionBegin;
1949   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1950   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1951 
1952   idx = a->j;
1953   v   = a->a;
1954   if (usecprow) {
1955     if (zz != yy) {
1956       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1957     }
1958     mbs  = a->compressedrow.nrows;
1959     ii   = a->compressedrow.i;
1960     ridx = a->compressedrow.rindex;
1961   } else {
1962     ii = a->i;
1963     y  = yarray;
1964     z  = zarray;
1965   }
1966 
1967   for (i=0; i<mbs; i++) {
1968     n = ii[1] - ii[0]; ii++;
1969     if (usecprow) {
1970       z = zarray + 6*ridx[i];
1971       y = yarray + 6*ridx[i];
1972     }
1973     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1974     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1975     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1976     for (j=0; j<n; j++) {
1977       xb    = x + 6*(*idx++);
1978       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1979       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1980       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1981       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1982       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1983       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1984       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1985       v    += 36;
1986     }
1987     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1988     if (!usecprow) {
1989       z += 6; y += 6;
1990     }
1991   }
1992   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1993   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1994   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1995   PetscFunctionReturn(0);
1996 }
1997 
1998 #undef __FUNCT__
1999 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2000 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2001 {
2002   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2003   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2004   const PetscScalar *x,*xb;
2005   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2006   const MatScalar   *v;
2007   PetscErrorCode    ierr;
2008   PetscInt          mbs = a->mbs,i,j,n;
2009   const PetscInt    *idx,*ii,*ridx = NULL;
2010   PetscBool         usecprow=a->compressedrow.use;
2011 
2012   PetscFunctionBegin;
2013   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2014   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2015 
2016   idx = a->j;
2017   v   = a->a;
2018   if (usecprow) {
2019     if (zz != yy) {
2020       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2021     }
2022     mbs  = a->compressedrow.nrows;
2023     ii   = a->compressedrow.i;
2024     ridx = a->compressedrow.rindex;
2025   } else {
2026     ii = a->i;
2027     y  = yarray;
2028     z  = zarray;
2029   }
2030 
2031   for (i=0; i<mbs; i++) {
2032     n = ii[1] - ii[0]; ii++;
2033     if (usecprow) {
2034       z = zarray + 7*ridx[i];
2035       y = yarray + 7*ridx[i];
2036     }
2037     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2038     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2039     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2040     for (j=0; j<n; j++) {
2041       xb    = x + 7*(*idx++);
2042       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2043       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2044       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2045       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2046       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2047       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2048       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2049       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2050       v    += 49;
2051     }
2052     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2053     if (!usecprow) {
2054       z += 7; y += 7;
2055     }
2056   }
2057   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2058   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2059   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2060   PetscFunctionReturn(0);
2061 }
2062 
2063 #undef __FUNCT__
2064 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2065 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2066 {
2067   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2068   PetscScalar       *z = 0,*work,*workt,*zarray;
2069   const PetscScalar *x,*xb;
2070   const MatScalar   *v;
2071   PetscErrorCode    ierr;
2072   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2073   PetscInt          ncols,k;
2074   const PetscInt    *ridx = NULL,*idx,*ii;
2075   PetscBool         usecprow = a->compressedrow.use;
2076 
2077   PetscFunctionBegin;
2078   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2079   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2080   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2081 
2082   idx = a->j;
2083   v   = a->a;
2084   if (usecprow) {
2085     mbs  = a->compressedrow.nrows;
2086     ii   = a->compressedrow.i;
2087     ridx = a->compressedrow.rindex;
2088   } else {
2089     mbs = a->mbs;
2090     ii  = a->i;
2091     z   = zarray;
2092   }
2093 
2094   if (!a->mult_work) {
2095     k    = PetscMax(A->rmap->n,A->cmap->n);
2096     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2097   }
2098   work = a->mult_work;
2099   for (i=0; i<mbs; i++) {
2100     n     = ii[1] - ii[0]; ii++;
2101     ncols = n*bs;
2102     workt = work;
2103     for (j=0; j<n; j++) {
2104       xb = x + bs*(*idx++);
2105       for (k=0; k<bs; k++) workt[k] = xb[k];
2106       workt += bs;
2107     }
2108     if (usecprow) z = zarray + bs*ridx[i];
2109     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2110     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2111     v += n*bs2;
2112     if (!usecprow) z += bs;
2113   }
2114   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2115   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2116   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2117   PetscFunctionReturn(0);
2118 }
2119 
2120 #undef __FUNCT__
2121 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2122 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2123 {
2124   PetscScalar    zero = 0.0;
2125   PetscErrorCode ierr;
2126 
2127   PetscFunctionBegin;
2128   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2129   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2130   PetscFunctionReturn(0);
2131 }
2132 
2133 #undef __FUNCT__
2134 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2135 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2136 {
2137   PetscScalar    zero = 0.0;
2138   PetscErrorCode ierr;
2139 
2140   PetscFunctionBegin;
2141   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2142   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2143   PetscFunctionReturn(0);
2144 }
2145 
2146 #undef __FUNCT__
2147 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2148 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2149 {
2150   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2151   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2152   const PetscScalar *x,*xb = NULL;
2153   const MatScalar   *v;
2154   PetscErrorCode    ierr;
2155   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2156   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2157   Mat_CompressedRow cprow = a->compressedrow;
2158   PetscBool         usecprow = cprow.use;
2159 
2160   PetscFunctionBegin;
2161   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2162   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2163   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2164 
2165   idx = a->j;
2166   v   = a->a;
2167   if (usecprow) {
2168     mbs  = cprow.nrows;
2169     ii   = cprow.i;
2170     ridx = cprow.rindex;
2171   } else {
2172     mbs=a->mbs;
2173     ii = a->i;
2174     xb = x;
2175   }
2176 
2177   switch (bs) {
2178   case 1:
2179     for (i=0; i<mbs; i++) {
2180       if (usecprow) xb = x + ridx[i];
2181       x1 = xb[0];
2182       ib = idx + ii[0];
2183       n  = ii[1] - ii[0]; ii++;
2184       for (j=0; j<n; j++) {
2185         rval     = ib[j];
2186         z[rval] += PetscConj(*v) * x1;
2187         v++;
2188       }
2189       if (!usecprow) xb++;
2190     }
2191     break;
2192   case 2:
2193     for (i=0; i<mbs; i++) {
2194       if (usecprow) xb = x + 2*ridx[i];
2195       x1 = xb[0]; x2 = xb[1];
2196       ib = idx + ii[0];
2197       n  = ii[1] - ii[0]; ii++;
2198       for (j=0; j<n; j++) {
2199         rval       = ib[j]*2;
2200         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2201         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2202         v         += 4;
2203       }
2204       if (!usecprow) xb += 2;
2205     }
2206     break;
2207   case 3:
2208     for (i=0; i<mbs; i++) {
2209       if (usecprow) xb = x + 3*ridx[i];
2210       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2211       ib = idx + ii[0];
2212       n  = ii[1] - ii[0]; ii++;
2213       for (j=0; j<n; j++) {
2214         rval       = ib[j]*3;
2215         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2216         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2217         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2218         v         += 9;
2219       }
2220       if (!usecprow) xb += 3;
2221     }
2222     break;
2223   case 4:
2224     for (i=0; i<mbs; i++) {
2225       if (usecprow) xb = x + 4*ridx[i];
2226       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2227       ib = idx + ii[0];
2228       n  = ii[1] - ii[0]; ii++;
2229       for (j=0; j<n; j++) {
2230         rval       = ib[j]*4;
2231         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2232         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2233         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2234         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2235         v         += 16;
2236       }
2237       if (!usecprow) xb += 4;
2238     }
2239     break;
2240   case 5:
2241     for (i=0; i<mbs; i++) {
2242       if (usecprow) xb = x + 5*ridx[i];
2243       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2244       x4 = xb[3]; x5 = xb[4];
2245       ib = idx + ii[0];
2246       n  = ii[1] - ii[0]; ii++;
2247       for (j=0; j<n; j++) {
2248         rval       = ib[j]*5;
2249         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2250         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2251         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2252         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2253         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2254         v         += 25;
2255       }
2256       if (!usecprow) xb += 5;
2257     }
2258     break;
2259   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2260     PetscInt          ncols,k;
2261     PetscScalar       *work,*workt;
2262     const PetscScalar *xtmp;
2263 
2264     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2265     if (!a->mult_work) {
2266       k    = PetscMax(A->rmap->n,A->cmap->n);
2267       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2268     }
2269     work = a->mult_work;
2270     xtmp = x;
2271     for (i=0; i<mbs; i++) {
2272       n     = ii[1] - ii[0]; ii++;
2273       ncols = n*bs;
2274       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2275       if (usecprow) xtmp = x + bs*ridx[i];
2276       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2277       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2278       v += n*bs2;
2279       if (!usecprow) xtmp += bs;
2280       workt = work;
2281       for (j=0; j<n; j++) {
2282         zb = z + bs*(*idx++);
2283         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2284         workt += bs;
2285       }
2286     }
2287     }
2288   }
2289   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2290   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2291   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2292   PetscFunctionReturn(0);
2293 }
2294 
2295 #undef __FUNCT__
2296 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2297 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2298 {
2299   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2300   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2301   const PetscScalar *x,*xb = 0;
2302   const MatScalar   *v;
2303   PetscErrorCode    ierr;
2304   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2305   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2306   Mat_CompressedRow cprow   = a->compressedrow;
2307   PetscBool         usecprow=cprow.use;
2308 
2309   PetscFunctionBegin;
2310   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2311   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2312   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2313 
2314   idx = a->j;
2315   v   = a->a;
2316   if (usecprow) {
2317     mbs  = cprow.nrows;
2318     ii   = cprow.i;
2319     ridx = cprow.rindex;
2320   } else {
2321     mbs=a->mbs;
2322     ii = a->i;
2323     xb = x;
2324   }
2325 
2326   switch (bs) {
2327   case 1:
2328     for (i=0; i<mbs; i++) {
2329       if (usecprow) xb = x + ridx[i];
2330       x1 = xb[0];
2331       ib = idx + ii[0];
2332       n  = ii[1] - ii[0]; ii++;
2333       for (j=0; j<n; j++) {
2334         rval     = ib[j];
2335         z[rval] += *v * x1;
2336         v++;
2337       }
2338       if (!usecprow) xb++;
2339     }
2340     break;
2341   case 2:
2342     for (i=0; i<mbs; i++) {
2343       if (usecprow) xb = x + 2*ridx[i];
2344       x1 = xb[0]; x2 = xb[1];
2345       ib = idx + ii[0];
2346       n  = ii[1] - ii[0]; ii++;
2347       for (j=0; j<n; j++) {
2348         rval       = ib[j]*2;
2349         z[rval++] += v[0]*x1 + v[1]*x2;
2350         z[rval++] += v[2]*x1 + v[3]*x2;
2351         v         += 4;
2352       }
2353       if (!usecprow) xb += 2;
2354     }
2355     break;
2356   case 3:
2357     for (i=0; i<mbs; i++) {
2358       if (usecprow) xb = x + 3*ridx[i];
2359       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2360       ib = idx + ii[0];
2361       n  = ii[1] - ii[0]; ii++;
2362       for (j=0; j<n; j++) {
2363         rval       = ib[j]*3;
2364         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2365         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2366         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2367         v         += 9;
2368       }
2369       if (!usecprow) xb += 3;
2370     }
2371     break;
2372   case 4:
2373     for (i=0; i<mbs; i++) {
2374       if (usecprow) xb = x + 4*ridx[i];
2375       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2376       ib = idx + ii[0];
2377       n  = ii[1] - ii[0]; ii++;
2378       for (j=0; j<n; j++) {
2379         rval       = ib[j]*4;
2380         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2381         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2382         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2383         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2384         v         += 16;
2385       }
2386       if (!usecprow) xb += 4;
2387     }
2388     break;
2389   case 5:
2390     for (i=0; i<mbs; i++) {
2391       if (usecprow) xb = x + 5*ridx[i];
2392       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2393       x4 = xb[3]; x5 = xb[4];
2394       ib = idx + ii[0];
2395       n  = ii[1] - ii[0]; ii++;
2396       for (j=0; j<n; j++) {
2397         rval       = ib[j]*5;
2398         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2399         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2400         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2401         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2402         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2403         v         += 25;
2404       }
2405       if (!usecprow) xb += 5;
2406     }
2407     break;
2408   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2409     PetscInt          ncols,k;
2410     PetscScalar       *work,*workt;
2411     const PetscScalar *xtmp;
2412     if (!a->mult_work) {
2413       k    = PetscMax(A->rmap->n,A->cmap->n);
2414       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2415     }
2416     work = a->mult_work;
2417     xtmp = x;
2418     for (i=0; i<mbs; i++) {
2419       n     = ii[1] - ii[0]; ii++;
2420       ncols = n*bs;
2421       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2422       if (usecprow) xtmp = x + bs*ridx[i];
2423       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2424       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2425       v += n*bs2;
2426       if (!usecprow) xtmp += bs;
2427       workt = work;
2428       for (j=0; j<n; j++) {
2429         zb = z + bs*(*idx++);
2430         for (k=0; k<bs; k++) zb[k] += workt[k];
2431         workt += bs;
2432       }
2433     }
2434     }
2435   }
2436   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2437   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2438   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2439   PetscFunctionReturn(0);
2440 }
2441 
2442 #undef __FUNCT__
2443 #define __FUNCT__ "MatScale_SeqBAIJ"
2444 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2445 {
2446   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2447   PetscInt       totalnz = a->bs2*a->nz;
2448   PetscScalar    oalpha  = alpha;
2449   PetscErrorCode ierr;
2450   PetscBLASInt   one = 1,tnz;
2451 
2452   PetscFunctionBegin;
2453   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2454   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2455   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2456   PetscFunctionReturn(0);
2457 }
2458 
2459 #undef __FUNCT__
2460 #define __FUNCT__ "MatNorm_SeqBAIJ"
2461 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2462 {
2463   PetscErrorCode ierr;
2464   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2465   MatScalar      *v  = a->a;
2466   PetscReal      sum = 0.0;
2467   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2468 
2469   PetscFunctionBegin;
2470   if (type == NORM_FROBENIUS) {
2471     for (i=0; i< bs2*nz; i++) {
2472       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2473     }
2474     *norm = PetscSqrtReal(sum);
2475   } else if (type == NORM_1) { /* maximum column sum */
2476     PetscReal *tmp;
2477     PetscInt  *bcol = a->j;
2478     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2479     for (i=0; i<nz; i++) {
2480       for (j=0; j<bs; j++) {
2481         k1 = bs*(*bcol) + j; /* column index */
2482         for (k=0; k<bs; k++) {
2483           tmp[k1] += PetscAbsScalar(*v); v++;
2484         }
2485       }
2486       bcol++;
2487     }
2488     *norm = 0.0;
2489     for (j=0; j<A->cmap->n; j++) {
2490       if (tmp[j] > *norm) *norm = tmp[j];
2491     }
2492     ierr = PetscFree(tmp);CHKERRQ(ierr);
2493   } else if (type == NORM_INFINITY) { /* maximum row sum */
2494     *norm = 0.0;
2495     for (k=0; k<bs; k++) {
2496       for (j=0; j<a->mbs; j++) {
2497         v   = a->a + bs2*a->i[j] + k;
2498         sum = 0.0;
2499         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2500           for (k1=0; k1<bs; k1++) {
2501             sum += PetscAbsScalar(*v);
2502             v   += bs;
2503           }
2504         }
2505         if (sum > *norm) *norm = sum;
2506       }
2507     }
2508   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2509   PetscFunctionReturn(0);
2510 }
2511 
2512 
2513 #undef __FUNCT__
2514 #define __FUNCT__ "MatEqual_SeqBAIJ"
2515 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2516 {
2517   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2518   PetscErrorCode ierr;
2519 
2520   PetscFunctionBegin;
2521   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2522   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2523     *flg = PETSC_FALSE;
2524     PetscFunctionReturn(0);
2525   }
2526 
2527   /* if the a->i are the same */
2528   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2529   if (!*flg) PetscFunctionReturn(0);
2530 
2531   /* if a->j are the same */
2532   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2533   if (!*flg) PetscFunctionReturn(0);
2534 
2535   /* if a->a are the same */
2536   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2537   PetscFunctionReturn(0);
2538 
2539 }
2540 
2541 #undef __FUNCT__
2542 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2543 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2544 {
2545   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2546   PetscErrorCode ierr;
2547   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2548   PetscScalar    *x,zero = 0.0;
2549   MatScalar      *aa,*aa_j;
2550 
2551   PetscFunctionBegin;
2552   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2553   bs   = A->rmap->bs;
2554   aa   = a->a;
2555   ai   = a->i;
2556   aj   = a->j;
2557   ambs = a->mbs;
2558   bs2  = a->bs2;
2559 
2560   ierr = VecSet(v,zero);CHKERRQ(ierr);
2561   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2562   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2563   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2564   for (i=0; i<ambs; i++) {
2565     for (j=ai[i]; j<ai[i+1]; j++) {
2566       if (aj[j] == i) {
2567         row  = i*bs;
2568         aa_j = aa+j*bs2;
2569         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2570         break;
2571       }
2572     }
2573   }
2574   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2575   PetscFunctionReturn(0);
2576 }
2577 
2578 #undef __FUNCT__
2579 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2580 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2581 {
2582   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2583   const PetscScalar *l,*r,*li,*ri;
2584   PetscScalar       x;
2585   MatScalar         *aa, *v;
2586   PetscErrorCode    ierr;
2587   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2588   const PetscInt    *ai,*aj;
2589 
2590   PetscFunctionBegin;
2591   ai  = a->i;
2592   aj  = a->j;
2593   aa  = a->a;
2594   m   = A->rmap->n;
2595   n   = A->cmap->n;
2596   bs  = A->rmap->bs;
2597   mbs = a->mbs;
2598   bs2 = a->bs2;
2599   if (ll) {
2600     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2601     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2602     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2603     for (i=0; i<mbs; i++) { /* for each block row */
2604       M  = ai[i+1] - ai[i];
2605       li = l + i*bs;
2606       v  = aa + bs2*ai[i];
2607       for (j=0; j<M; j++) { /* for each block */
2608         for (k=0; k<bs2; k++) {
2609           (*v++) *= li[k%bs];
2610         }
2611       }
2612     }
2613     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2614     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2615   }
2616 
2617   if (rr) {
2618     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2619     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2620     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2621     for (i=0; i<mbs; i++) { /* for each block row */
2622       iai = ai[i];
2623       M   = ai[i+1] - iai;
2624       v   = aa + bs2*iai;
2625       for (j=0; j<M; j++) { /* for each block */
2626         ri = r + bs*aj[iai+j];
2627         for (k=0; k<bs; k++) {
2628           x = ri[k];
2629           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2630           v += bs;
2631         }
2632       }
2633     }
2634     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2635     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2636   }
2637   PetscFunctionReturn(0);
2638 }
2639 
2640 
2641 #undef __FUNCT__
2642 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2643 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2644 {
2645   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2646 
2647   PetscFunctionBegin;
2648   info->block_size   = a->bs2;
2649   info->nz_allocated = a->bs2*a->maxnz;
2650   info->nz_used      = a->bs2*a->nz;
2651   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2652   info->assemblies   = A->num_ass;
2653   info->mallocs      = A->info.mallocs;
2654   info->memory       = ((PetscObject)A)->mem;
2655   if (A->factortype) {
2656     info->fill_ratio_given  = A->info.fill_ratio_given;
2657     info->fill_ratio_needed = A->info.fill_ratio_needed;
2658     info->factor_mallocs    = A->info.factor_mallocs;
2659   } else {
2660     info->fill_ratio_given  = 0;
2661     info->fill_ratio_needed = 0;
2662     info->factor_mallocs    = 0;
2663   }
2664   PetscFunctionReturn(0);
2665 }
2666 
2667 
2668 #if defined(PETSC_THREADCOMM_ACTIVE)
2669 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2670 {
2671   PetscErrorCode ierr;
2672   PetscInt       *trstarts=A->rmap->trstarts;
2673   PetscInt       n,start,end;
2674   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2675 
2676   start = trstarts[thread_id]/A->rmap->bs;
2677   end   = trstarts[thread_id+1]/A->rmap->bs;
2678   n     = a->i[end] - a->i[start];
2679   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
2680   return 0;
2681 }
2682 
2683 #undef __FUNCT__
2684 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2685 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2686 {
2687   PetscErrorCode ierr;
2688 
2689 
2690   PetscFunctionBegin;
2691   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2692   PetscFunctionReturn(0);
2693 }
2694 #else
2695 #undef __FUNCT__
2696 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2697 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2698 {
2699   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2700   PetscErrorCode ierr;
2701 
2702   PetscFunctionBegin;
2703   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2704   PetscFunctionReturn(0);
2705 }
2706 #endif
2707