xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 734794cf6305c19bac7ec37970b3aba1264ea7c2)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 #if defined(PETSC_THREADCOMM_ACTIVE)
7 #include <petscthreadcomm.h>
8 #endif
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatIncreaseOverlap_SeqBAIJ"
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ_Private"
78 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79 {
80   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
81   PetscErrorCode ierr;
82   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
84   const PetscInt *irow,*icol;
85   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86   PetscInt       *aj = a->j,*ai = a->i;
87   MatScalar      *mat_a;
88   Mat            C;
89   PetscBool      flag;
90 
91   PetscFunctionBegin;
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
99   ssmap = smap;
100   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
101   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107     for (k=kstart; k<kend; k++) {
108       if (ssmap[aj[k]]) lens[i]++;
109     }
110   }
111   /* Create and fill new matrix */
112   if (scall == MAT_REUSE_MATRIX) {
113     c = (Mat_SeqBAIJ*)((*B)->data);
114 
115     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
117     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
119     C    = *B;
120   } else {
121     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
122     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
123     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
124     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);CHKERRQ(ierr);
125   }
126   c = (Mat_SeqBAIJ*)(C->data);
127   for (i=0; i<nrows; i++) {
128     row      = irow[i];
129     kstart   = ai[row];
130     kend     = kstart + a->ilen[row];
131     mat_i    = c->i[i];
132     mat_j    = c->j + mat_i;
133     mat_a    = c->a + mat_i*bs2;
134     mat_ilen = c->ilen + i;
135     for (k=kstart; k<kend; k++) {
136       if ((tcol=ssmap[a->j[k]])) {
137         *mat_j++ = tcol - 1;
138         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
139         mat_a   += bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144   /* sort */
145   {
146     MatScalar *work;
147 
148     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
149     for (i=0; i<nrows; i++) {
150       PetscInt ilen;
151       mat_i = c->i[i];
152       mat_j = c->j + mat_i;
153       mat_a = c->a + mat_i*bs2;
154       ilen  = c->ilen[i];
155       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
156     }
157     ierr = PetscFree(work);CHKERRQ(ierr);
158   }
159 
160   /* Free work space */
161   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
162   ierr = PetscFree(smap);CHKERRQ(ierr);
163   ierr = PetscFree(lens);CHKERRQ(ierr);
164   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166 
167   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
168   *B   = C;
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNCT__
173 #define __FUNCT__ "MatGetSubMatrix_SeqBAIJ"
174 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
175 {
176   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
177   IS             is1,is2;
178   PetscErrorCode ierr;
179   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
180   const PetscInt *irow,*icol;
181 
182   PetscFunctionBegin;
183   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
184   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
185   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
186   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
187 
188   /* Verify if the indices corespond to each element in a block
189    and form the IS with compressed IS */
190   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
191   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
192   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
193   for (i=0; i<a->mbs; i++) {
194     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
195   }
196   count = 0;
197   for (i=0; i<nrows; i++) {
198     PetscInt j = irow[i] / bs;
199     if ((vary[j]--)==bs) iary[count++] = j;
200   }
201   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
202 
203   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
204   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
205   for (i=0; i<a->mbs; i++) {
206     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
207   }
208   count = 0;
209   for (i=0; i<ncols; i++) {
210     PetscInt j = icol[i] / bs;
211     if ((vary[j]--)==bs) iary[count++] = j;
212   }
213   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
214   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
215   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
216   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
217 
218   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
219   ierr = ISDestroy(&is1);CHKERRQ(ierr);
220   ierr = ISDestroy(&is2);CHKERRQ(ierr);
221   PetscFunctionReturn(0);
222 }
223 
224 #undef __FUNCT__
225 #define __FUNCT__ "MatGetSubMatrices_SeqBAIJ"
226 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
227 {
228   PetscErrorCode ierr;
229   PetscInt       i;
230 
231   PetscFunctionBegin;
232   if (scall == MAT_INITIAL_MATRIX) {
233     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
234   }
235 
236   for (i=0; i<n; i++) {
237     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 
243 /* -------------------------------------------------------*/
244 /* Should check that shapes of vectors and matrices match */
245 /* -------------------------------------------------------*/
246 
247 #if defined(PETSC_THREADCOMM_ACTIVE)
248 PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
249 {
250   PetscErrorCode    ierr;
251   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
252   PetscScalar       *z;
253   const PetscScalar *x;
254   const MatScalar   *aa;
255   PetscInt          *trstarts=A->rmap->trstarts;
256   PetscInt          n,start,end,i;
257   const PetscInt    *aj,*ai;
258   PetscScalar       sum;
259 
260   ierr  = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
261   ierr  = VecGetArray(zz,&z);CHKERRQ(ierr);
262   start = trstarts[thread_id];
263   end   = trstarts[thread_id+1];
264   ai    = a->i;
265   for (i=start; i<end; i++) {
266     n   = ai[i+1] - ai[i];
267     aj  = a->j + ai[i];
268     aa  = a->a + ai[i];
269     sum = 0.0;
270     PetscSparseDensePlusDot(sum,x,aa,aj,n);
271     z[i] = sum;
272   }
273   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
274   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
275   return 0;
276 }
277 
278 #undef __FUNCT__
279 #define __FUNCT__ "MatMult_SeqBAIJ_1"
280 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
281 {
282   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
283   PetscScalar       *z,sum;
284   const PetscScalar *x;
285   const MatScalar   *v;
286   PetscErrorCode    ierr;
287   PetscInt          mbs,i,n;
288   const PetscInt    *idx,*ii,*ridx=NULL;
289   PetscBool         usecprow=a->compressedrow.use;
290 
291   PetscFunctionBegin;
292 
293   if (usecprow) {
294     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
295     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
296     mbs  = a->compressedrow.nrows;
297     ii   = a->compressedrow.i;
298     ridx = a->compressedrow.rindex;
299     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
300     for (i=0; i<mbs; i++) {
301       n   = ii[i+1] - ii[i];
302       v   = a->a + ii[i];
303       idx = a->j + ii[i];
304       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
305       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
306       sum = 0.0;
307       PetscSparseDensePlusDot(sum,x,v,idx,n);
308       z[ridx[i]] = sum;
309     }
310     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
311     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
312   } else {
313     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);CHKERRQ(ierr);
314   }
315   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 #else
319 #undef __FUNCT__
320 #define __FUNCT__ "MatMult_SeqBAIJ_1"
321 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
322 {
323   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
324   PetscScalar       *z,sum;
325   const PetscScalar *x;
326   const MatScalar   *v;
327   PetscErrorCode    ierr;
328   PetscInt          mbs,i,n;
329   const PetscInt    *idx,*ii,*ridx=NULL;
330   PetscBool         usecprow=a->compressedrow.use;
331 
332   PetscFunctionBegin;
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
335 
336   if (usecprow) {
337     mbs  = a->compressedrow.nrows;
338     ii   = a->compressedrow.i;
339     ridx = a->compressedrow.rindex;
340     ierr = PetscMemzero(z,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
341   } else {
342     mbs = a->mbs;
343     ii  = a->i;
344   }
345 
346   for (i=0; i<mbs; i++) {
347     n   = ii[1] - ii[0];
348     v   = a->a + ii[0];
349     idx = a->j + ii[0];
350     ii++;
351     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
352     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353     sum = 0.0;
354     PetscSparseDensePlusDot(sum,x,v,idx,n);
355     if (usecprow) {
356       z[ridx[i]] = sum;
357     } else {
358       z[i]        = sum;
359     }
360   }
361   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
362   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
363   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
364   PetscFunctionReturn(0);
365 }
366 #endif
367 
368 #if defined(PETSC_THREADCOMM_ACTIVE)
369 PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
370 {
371   PetscErrorCode    ierr;
372   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
373   PetscScalar       *z,x1,x2,sum1,sum2;
374   const PetscScalar *x,*xb;
375   const MatScalar   *aa;
376   PetscInt          *trstarts=A->rmap->trstarts;
377   PetscInt          n,start,end,i,j;
378   const PetscInt    *aj,*ai;
379 
380   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
381   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
382   start  = trstarts[thread_id] / 2;
383   end    = trstarts[thread_id+1] / 2;
384   ai     = a->i;
385   for (i=start; i<end; i++) {
386     n    = ai[i+1] - ai[i];
387     aj   = a->j + ai[i];
388     aa   = a->a + ai[i]*4;
389     sum1 = 0.0; sum2 = 0.0;
390     for (j=0; j<n; j++) {
391       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
392       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
393       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
394     }
395     z[2*i] = sum1; z[2*i+1] = sum2;
396   }
397   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
398   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
399   return 0;
400 }
401 
402 #undef __FUNCT__
403 #define __FUNCT__ "MatMult_SeqBAIJ_2"
404 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
405 {
406   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
407   PetscScalar       *z,x1,x2,sum1,sum2;
408   const PetscScalar *x,*xb;
409   const MatScalar   *v;
410   PetscErrorCode    ierr;
411   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
412   PetscBool         usecprow=a->compressedrow.use;
413 
414   PetscFunctionBegin;
415 
416   if (usecprow) {
417     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
418     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
419     mbs  = a->compressedrow.nrows;
420     ii   = a->compressedrow.i;
421     ridx = a->compressedrow.rindex;
422     for (i=0; i<mbs; i++) {
423       n    = ii[i+1] - ii[i];
424       idx  = a->j + ii[i];
425       v    = a->a + ii[i]*4;
426       sum1 = 0.0; sum2 = 0.0;
427       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
428       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429       for (j=0; j<n; j++) {
430 	xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
431 	sum1 += v[4*j]*x1   + v[4*j+2]*x2;
432 	sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
433       }
434       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
435     }
436     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
437     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
438   } else {
439     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);CHKERRQ(ierr);
440   }
441   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
442   PetscFunctionReturn(0);
443 }
444 #else
445 #undef __FUNCT__
446 #define __FUNCT__ "MatMult_SeqBAIJ_2"
447 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
448 {
449   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
450   PetscScalar       *z = 0,sum1,sum2,*zarray;
451   const PetscScalar *x,*xb;
452   PetscScalar       x1,x2;
453   const MatScalar   *v;
454   PetscErrorCode    ierr;
455   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
456   PetscBool         usecprow=a->compressedrow.use;
457 
458   PetscFunctionBegin;
459   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
460   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
461 
462   idx = a->j;
463   v   = a->a;
464   if (usecprow) {
465     mbs  = a->compressedrow.nrows;
466     ii   = a->compressedrow.i;
467     ridx = a->compressedrow.rindex;
468   } else {
469     mbs = a->mbs;
470     ii  = a->i;
471     z   = zarray;
472   }
473 
474   for (i=0; i<mbs; i++) {
475     n           = ii[1] - ii[0]; ii++;
476     sum1        = 0.0; sum2 = 0.0;
477     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
478     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479     for (j=0; j<n; j++) {
480       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
481       sum1 += v[0]*x1 + v[2]*x2;
482       sum2 += v[1]*x1 + v[3]*x2;
483       v    += 4;
484     }
485     if (usecprow) z = zarray + 2*ridx[i];
486     z[0] = sum1; z[1] = sum2;
487     if (!usecprow) z += 2;
488   }
489   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
490   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
491   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
492   PetscFunctionReturn(0);
493 }
494 #endif
495 
496 #if defined(PETSC_THREADCOMM_ACTIVE)
497 PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
498 {
499   PetscErrorCode    ierr;
500   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
501   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
502   const PetscScalar *x,*xb;
503   const MatScalar   *aa;
504   PetscInt          *trstarts=A->rmap->trstarts;
505   PetscInt          n,start,end,i,j;
506   const PetscInt    *aj,*ai;
507 
508   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
510   start  = trstarts[thread_id] / 3;
511   end    = trstarts[thread_id+1] / 3;
512   ai     = a->i;
513   for (i=start; i<end; i++) {
514     n    = ai[i+1] - ai[i];
515     aj   = a->j + ai[i];
516     aa   = a->a + ai[i]*9;
517     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
518     for (j=0; j<n; j++) {
519       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
520       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
521       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
522       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
523     }
524     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
525   }
526   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
527   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
528   return 0;
529 }
530 
531 #undef __FUNCT__
532 #define __FUNCT__ "MatMult_SeqBAIJ_3"
533 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
534 {
535   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
536   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
537   const PetscScalar *x,*xb;
538   const MatScalar   *v;
539   PetscErrorCode    ierr;
540   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
541   PetscBool         usecprow=a->compressedrow.use;
542 
543 
544 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
545 #pragma disjoint(*v,*z,*xb)
546 #endif
547 
548   PetscFunctionBegin;
549 
550   if (usecprow) {
551     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
552     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
553     mbs  = a->compressedrow.nrows;
554     ii   = a->compressedrow.i;
555     ridx = a->compressedrow.rindex;
556     for (i=0; i<mbs; i++) {
557       n    = ii[i+1] - ii[i];
558       idx  = a->j + ii[i];
559       v    = a->a + ii[i]*9;
560       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
561       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
562       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
563       for (j=0; j<n; j++) {
564 	xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
565 	sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
566 	sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
567 	sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
568       }
569       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
570     }
571     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
572     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
573   } else {
574     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);CHKERRQ(ierr);
575   }
576   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 #else
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMult_SeqBAIJ_3"
582 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
583 {
584   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
585   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
586   const PetscScalar *x,*xb;
587   const MatScalar   *v;
588   PetscErrorCode    ierr;
589   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
590   PetscBool         usecprow=a->compressedrow.use;
591 
592 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
593 #pragma disjoint(*v,*z,*xb)
594 #endif
595 
596   PetscFunctionBegin;
597   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
598   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
599 
600   idx = a->j;
601   v   = a->a;
602   if (usecprow) {
603     mbs  = a->compressedrow.nrows;
604     ii   = a->compressedrow.i;
605     ridx = a->compressedrow.rindex;
606   } else {
607     mbs = a->mbs;
608     ii  = a->i;
609     z   = zarray;
610   }
611 
612   for (i=0; i<mbs; i++) {
613     n           = ii[1] - ii[0]; ii++;
614     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
615     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
616     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
617     for (j=0; j<n; j++) {
618       xb = x + 3*(*idx++);
619       x1 = xb[0];
620       x2 = xb[1];
621       x3 = xb[2];
622 
623       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
624       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
625       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
626       v    += 9;
627     }
628     if (usecprow) z = zarray + 3*ridx[i];
629     z[0] = sum1; z[1] = sum2; z[2] = sum3;
630     if (!usecprow) z += 3;
631   }
632   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
633   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
634   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
635   PetscFunctionReturn(0);
636 }
637 #endif
638 
639 #if defined(PETSC_THREADCOMM_ACTIVE)
640 PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
641 {
642   PetscErrorCode    ierr;
643   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
644   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
645   const PetscScalar *x,*xb;
646   const MatScalar   *aa;
647   PetscInt          *trstarts=A->rmap->trstarts;
648   PetscInt          n,start,end,i,j;
649   const PetscInt    *aj,*ai;
650 
651   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
652   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
653   start  = trstarts[thread_id] / 4;
654   end    = trstarts[thread_id+1] / 4;
655   ai     = a->i;
656   for (i=start; i<end; i++) {
657     n    = ai[i+1] - ai[i];
658     aj   = a->j + ai[i];
659     aa   = a->a + ai[i]*16;
660     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
661     for (j=0; j<n; j++) {
662       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
663       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
664       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
665       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
666       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
667     }
668     z[4*i]   = sum1; z[4*i+1] = sum2;
669     z[4*i+2] = sum3; z[4*i+3] = sum4;
670   }
671   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
672   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
673   return 0;
674 }
675 
676 #undef __FUNCT__
677 #define __FUNCT__ "MatMult_SeqBAIJ_4"
678 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
679 {
680   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
681   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
682   const PetscScalar *x,*xb;
683   const MatScalar   *v;
684   PetscErrorCode    ierr;
685   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
686   PetscBool         usecprow=a->compressedrow.use;
687 
688   PetscFunctionBegin;
689 
690   if (usecprow) {
691     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
692     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
693     mbs  = a->compressedrow.nrows;
694     ii   = a->compressedrow.i;
695     ridx = a->compressedrow.rindex;
696     for (i=0; i<mbs; i++) {
697       n = ii[i+1] - ii[1];
698       idx  = a->j + ii[i];
699       v    = a->a + ii[i]*16;
700       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
701       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
702       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
703       for (j=0; j<n; j++) {
704 	xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
705 	sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
706 	sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
707 	sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
708 	sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
709       }
710       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
711       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
712     }
713     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
714     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
715   } else {
716     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);CHKERRQ(ierr);
717   }
718   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
719   PetscFunctionReturn(0);
720 }
721 #else
722 #undef __FUNCT__
723 #define __FUNCT__ "MatMult_SeqBAIJ_4"
724 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
725 {
726   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
727   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
728   const PetscScalar *x,*xb;
729   const MatScalar   *v;
730   PetscErrorCode    ierr;
731   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
732   PetscBool         usecprow=a->compressedrow.use;
733 
734   PetscFunctionBegin;
735   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
736   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
737 
738   idx = a->j;
739   v   = a->a;
740   if (usecprow) {
741     mbs  = a->compressedrow.nrows;
742     ii   = a->compressedrow.i;
743     ridx = a->compressedrow.rindex;
744   } else {
745     mbs = a->mbs;
746     ii  = a->i;
747     z   = zarray;
748   }
749 
750   for (i=0; i<mbs; i++) {
751     n = ii[1] - ii[0];
752     ii++;
753     sum1 = 0.0;
754     sum2 = 0.0;
755     sum3 = 0.0;
756     sum4 = 0.0;
757 
758     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
759     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
760     for (j=0; j<n; j++) {
761       xb    = x + 4*(*idx++);
762       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
763       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
764       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
765       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
766       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
767       v    += 16;
768     }
769     if (usecprow) z = zarray + 4*ridx[i];
770     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
771     if (!usecprow) z += 4;
772   }
773   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
774   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
775   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
776   PetscFunctionReturn(0);
777 }
778 #endif
779 
780 #if defined(PETSC_THREADCOMM_ACTIVE)
781 PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
782 {
783   PetscErrorCode    ierr;
784   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
785   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
786   const PetscScalar *x,*xb;
787   const MatScalar   *aa;
788   PetscInt          *trstarts=A->rmap->trstarts;
789   PetscInt          n,start,end,i,j;
790   const PetscInt    *aj,*ai;
791 
792   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
793   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
794   start  = trstarts[thread_id] / 5;
795   end    = trstarts[thread_id+1] / 5;
796   ai     = a->i;
797   for (i=start; i<end; i++) {
798     n    = ai[i+1] - ai[i];
799     aj   = a->j + ai[i];
800     aa   = a->a + ai[i]*25;
801     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
802     for (j=0; j<n; j++) {
803       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
804       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
805       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
806       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
807       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
808       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
809     }
810     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
811     z[5*i+3] = sum4; z[5*i+4] = sum5;
812   }
813   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
814   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
815   return 0;
816 }
817 
818 #undef __FUNCT__
819 #define __FUNCT__ "MatMult_SeqBAIJ_5"
820 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
821 {
822   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
823   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
824   const PetscScalar *xb,*x;
825   const MatScalar   *v;
826   PetscErrorCode    ierr;
827   const PetscInt    *idx,*ii,*ridx=NULL;
828   PetscInt          mbs,i,j,n;
829   PetscBool         usecprow=a->compressedrow.use;
830 
831   PetscFunctionBegin;
832 
833   if (usecprow) {
834     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
835     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
836     mbs  = a->compressedrow.nrows;
837     ii   = a->compressedrow.i;
838     ridx = a->compressedrow.rindex;
839     for (i=0; i<mbs; i++) {
840       n    = ii[i+1] - ii[i];
841       idx  = a->j + ii[i];
842       v    = a->a + ii[i]*25;
843       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
844       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
845       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
846       for (j=0; j<n; j++) {
847 	xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
848 	sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
849 	sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
850 	sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
851 	sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
852 	sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
853       }
854       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
855       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
856     }
857     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
858     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
859   } else {
860     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);CHKERRQ(ierr);
861   }
862   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 #else
866 #undef __FUNCT__
867 #define __FUNCT__ "MatMult_SeqBAIJ_5"
868 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
869 {
870   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
871   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
872   const PetscScalar *xb,*x;
873   const MatScalar   *v;
874   PetscErrorCode    ierr;
875   const PetscInt    *idx,*ii,*ridx=NULL;
876   PetscInt          mbs,i,j,n;
877   PetscBool         usecprow=a->compressedrow.use;
878 
879   PetscFunctionBegin;
880   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
881   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
882 
883   idx = a->j;
884   v   = a->a;
885   if (usecprow) {
886     mbs  = a->compressedrow.nrows;
887     ii   = a->compressedrow.i;
888     ridx = a->compressedrow.rindex;
889   } else {
890     mbs = a->mbs;
891     ii  = a->i;
892     z   = zarray;
893   }
894 
895   for (i=0; i<mbs; i++) {
896     n           = ii[1] - ii[0]; ii++;
897     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
898     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
899     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
900     for (j=0; j<n; j++) {
901       xb    = x + 5*(*idx++);
902       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
903       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
904       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
905       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
906       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
907       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
908       v    += 25;
909     }
910     if (usecprow) z = zarray + 5*ridx[i];
911     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
912     if (!usecprow) z += 5;
913   }
914   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
915   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
916   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
917   PetscFunctionReturn(0);
918 }
919 #endif
920 
921 
922 #if defined(PETSC_THREADCOMM_ACTIVE)
923 PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
924 {
925   PetscErrorCode    ierr;
926   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
927   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
928   const PetscScalar *x,*xb;
929   const MatScalar   *aa;
930   PetscInt          *trstarts=A->rmap->trstarts;
931   PetscInt          n,start,end,i,j;
932   const PetscInt    *aj,*ai;
933 
934   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
935   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
936   start  = trstarts[thread_id] / 6;
937   end    = trstarts[thread_id+1] / 6;
938   ai     = a->i;
939   for (i=start; i<end; i++) {
940     n    = ai[i+1] - ai[i];
941     aj   = a->j + ai[i];
942     aa   = a->a + ai[i]*36;
943     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
944     for (j=0; j<n; j++) {
945       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
946       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
947       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
948       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
949       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
950       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
951       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
952     }
953     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
954     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
955   }
956   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
957   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
958   return 0;
959 }
960 
961 #undef __FUNCT__
962 #define __FUNCT__ "MatMult_SeqBAIJ_6"
963 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
964 {
965   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
966   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
967   const PetscScalar *x,*xb;
968   const MatScalar   *v;
969   PetscErrorCode    ierr;
970   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
971   PetscBool         usecprow=a->compressedrow.use;
972 
973   PetscFunctionBegin;
974 
975   if (usecprow) {
976     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
977     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
978     mbs  = a->compressedrow.nrows;
979     ii   = a->compressedrow.i;
980     ridx = a->compressedrow.rindex;
981     for (i=0; i<mbs; i++) {
982       n  = ii[i+1] - ii[i];
983       idx  = a->j + ii[i];
984       v    = a->a + ii[i]*36;
985       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
986       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
987       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
988       for (j=0; j<n; j++) {
989 	xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
990 	sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
991 	sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
992 	sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
993 	sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
994 	sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
995 	sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
996       }
997       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
998       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
999     }
1000     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1001     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1002   } else {
1003     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1004   }
1005   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1006   PetscFunctionReturn(0);
1007 }
1008 #else
1009 #undef __FUNCT__
1010 #define __FUNCT__ "MatMult_SeqBAIJ_6"
1011 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
1012 {
1013   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1014   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1015   const PetscScalar *x,*xb;
1016   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
1017   const MatScalar   *v;
1018   PetscErrorCode    ierr;
1019   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1020   PetscBool         usecprow=a->compressedrow.use;
1021 
1022   PetscFunctionBegin;
1023   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1024   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1025 
1026   idx = a->j;
1027   v   = a->a;
1028   if (usecprow) {
1029     mbs  = a->compressedrow.nrows;
1030     ii   = a->compressedrow.i;
1031     ridx = a->compressedrow.rindex;
1032   } else {
1033     mbs = a->mbs;
1034     ii  = a->i;
1035     z   = zarray;
1036   }
1037 
1038   for (i=0; i<mbs; i++) {
1039     n  = ii[1] - ii[0];
1040     ii++;
1041     sum1 = 0.0;
1042     sum2 = 0.0;
1043     sum3 = 0.0;
1044     sum4 = 0.0;
1045     sum5 = 0.0;
1046     sum6 = 0.0;
1047 
1048     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1049     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050     for (j=0; j<n; j++) {
1051       xb    = x + 6*(*idx++);
1052       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1053       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1054       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1055       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1056       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1057       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1058       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1059       v    += 36;
1060     }
1061     if (usecprow) z = zarray + 6*ridx[i];
1062     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1063     if (!usecprow) z += 6;
1064   }
1065 
1066   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1067   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1068   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
1069   PetscFunctionReturn(0);
1070 }
1071 #endif
1072 
1073 #if defined(PETSC_THREADCOMM_ACTIVE)
1074 PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1075 {
1076   PetscErrorCode    ierr;
1077   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1078   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1079   const PetscScalar *x,*xb;
1080   const MatScalar   *aa;
1081   PetscInt          *trstarts=A->rmap->trstarts;
1082   PetscInt          n,start,end,i,j;
1083   const PetscInt    *aj,*ai;
1084 
1085   ierr   = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1086   ierr   = VecGetArray(zz,&z);CHKERRQ(ierr);
1087   start  = trstarts[thread_id] / 7;
1088   end    = trstarts[thread_id+1] / 7;
1089   ai     = a->i;
1090   for (i=start; i<end; i++) {
1091     n    = ai[i+1] - ai[i];
1092     aj   = a->j + ai[i];
1093     aa   = a->a + ai[i]*49;
1094     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1095     for (j=0; j<n; j++) {
1096       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1097       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1098       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1099       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1100       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1101       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1102       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1103       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1104     }
1105     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1106     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1107   }
1108   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1109   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1110   return 0;
1111 }
1112 
1113 #undef __FUNCT__
1114 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1115 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1116 {
1117   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1118   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1119   const PetscScalar *x,*xb;
1120   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1121   const MatScalar   *v;
1122   PetscErrorCode    ierr;
1123   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1124   PetscBool         usecprow=a->compressedrow.use;
1125 
1126   PetscFunctionBegin;
1127 
1128   if (usecprow) {
1129     ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1130     ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1131     mbs  = a->compressedrow.nrows;
1132     ii   = a->compressedrow.i;
1133     ridx = a->compressedrow.rindex;
1134     for (i=0; i<mbs; i++) {
1135       n    = ii[i+1] - ii[i];
1136       idx  = a->j + ii[i];
1137       v    = a->a + ii[i]*49;
1138       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1139       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1140       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1141       for (j=0; j<n; j++) {
1142 	xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1143 	sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1144 	sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1145 	sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1146 	sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1147 	sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1148 	sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1149 	sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1150       }
1151       z[7*ridx[i]]   = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1152       z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1153     }
1154     ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1155     ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1156   } else {
1157     ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);CHKERRQ(ierr);
1158   }
1159   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1160   PetscFunctionReturn(0);
1161 }
1162 #else
1163 #undef __FUNCT__
1164 #define __FUNCT__ "MatMult_SeqBAIJ_7"
1165 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1166 {
1167   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1168   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1169   const PetscScalar *x,*xb;
1170   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1171   const MatScalar   *v;
1172   PetscErrorCode    ierr;
1173   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1174   PetscBool         usecprow=a->compressedrow.use;
1175 
1176   PetscFunctionBegin;
1177   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1178   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1179 
1180   idx = a->j;
1181   v   = a->a;
1182   if (usecprow) {
1183     mbs  = a->compressedrow.nrows;
1184     ii   = a->compressedrow.i;
1185     ridx = a->compressedrow.rindex;
1186   } else {
1187     mbs = a->mbs;
1188     ii  = a->i;
1189     z   = zarray;
1190   }
1191 
1192   for (i=0; i<mbs; i++) {
1193     n  = ii[1] - ii[0];
1194     ii++;
1195     sum1 = 0.0;
1196     sum2 = 0.0;
1197     sum3 = 0.0;
1198     sum4 = 0.0;
1199     sum5 = 0.0;
1200     sum6 = 0.0;
1201     sum7 = 0.0;
1202 
1203     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1204     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1205     for (j=0; j<n; j++) {
1206       xb    = x + 7*(*idx++);
1207       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1208       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1209       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1210       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1211       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1212       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1213       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1214       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1215       v    += 49;
1216     }
1217     if (usecprow) z = zarray + 7*ridx[i];
1218     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1219     if (!usecprow) z += 7;
1220   }
1221 
1222   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1223   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1224   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
1225   PetscFunctionReturn(0);
1226 }
1227 #endif
1228 
1229 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1230 /* Default MatMult for block size 15 */
1231 
1232 #undef __FUNCT__
1233 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver1"
1234 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1235 {
1236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1237   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1238   const PetscScalar *x,*xb;
1239   PetscScalar       *zarray,xv;
1240   const MatScalar   *v;
1241   PetscErrorCode    ierr;
1242   const PetscInt    *ii,*ij=a->j,*idx;
1243   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1244   PetscBool         usecprow=a->compressedrow.use;
1245 
1246   PetscFunctionBegin;
1247   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1248   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1249 
1250   v = a->a;
1251   if (usecprow) {
1252     mbs  = a->compressedrow.nrows;
1253     ii   = a->compressedrow.i;
1254     ridx = a->compressedrow.rindex;
1255   } else {
1256     mbs = a->mbs;
1257     ii  = a->i;
1258     z   = zarray;
1259   }
1260 
1261   for (i=0; i<mbs; i++) {
1262     n    = ii[i+1] - ii[i];
1263     idx  = ij + ii[i];
1264     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1265     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1266 
1267     for (j=0; j<n; j++) {
1268       xb = x + 15*(idx[j]);
1269 
1270       for (k=0; k<15; k++) {
1271         xv     =  xb[k];
1272         sum1  += v[0]*xv;
1273         sum2  += v[1]*xv;
1274         sum3  += v[2]*xv;
1275         sum4  += v[3]*xv;
1276         sum5  += v[4]*xv;
1277         sum6  += v[5]*xv;
1278         sum7  += v[6]*xv;
1279         sum8  += v[7]*xv;
1280         sum9  += v[8]*xv;
1281         sum10 += v[9]*xv;
1282         sum11 += v[10]*xv;
1283         sum12 += v[11]*xv;
1284         sum13 += v[12]*xv;
1285         sum14 += v[13]*xv;
1286         sum15 += v[14]*xv;
1287         v     += 15;
1288       }
1289     }
1290     if (usecprow) z = zarray + 15*ridx[i];
1291     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1292     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1293 
1294     if (!usecprow) z += 15;
1295   }
1296 
1297   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1298   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1299   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1304 #undef __FUNCT__
1305 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver2"
1306 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1307 {
1308   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1309   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1310   const PetscScalar *x,*xb;
1311   PetscScalar       x1,x2,x3,x4,*zarray;
1312   const MatScalar   *v;
1313   PetscErrorCode    ierr;
1314   const PetscInt    *ii,*ij=a->j,*idx;
1315   PetscInt          mbs,i,j,n,*ridx=NULL;
1316   PetscBool         usecprow=a->compressedrow.use;
1317 
1318   PetscFunctionBegin;
1319   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1320   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1321 
1322   v = a->a;
1323   if (usecprow) {
1324     mbs  = a->compressedrow.nrows;
1325     ii   = a->compressedrow.i;
1326     ridx = a->compressedrow.rindex;
1327   } else {
1328     mbs = a->mbs;
1329     ii  = a->i;
1330     z   = zarray;
1331   }
1332 
1333   for (i=0; i<mbs; i++) {
1334     n    = ii[i+1] - ii[i];
1335     idx  = ij + ii[i];
1336     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1337     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1338 
1339     for (j=0; j<n; j++) {
1340       xb = x + 15*(idx[j]);
1341       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1342 
1343       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1344       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1345       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1346       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1347       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1348       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1349       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1350       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1351       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1352       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1353       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1354       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1355       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1356       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1357       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1358 
1359       v += 60;
1360 
1361       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1362 
1363       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1364       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1365       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1366       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1367       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1368       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1369       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1370       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1371       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1372       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1373       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1374       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1375       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1376       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1377       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1378       v     += 60;
1379 
1380       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1381       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1382       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1383       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1384       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1385       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1386       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1387       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1388       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1389       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1390       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1391       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1392       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1393       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1394       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1395       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1396       v     += 60;
1397 
1398       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1399       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1400       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1401       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1402       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1403       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1404       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1405       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1406       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1407       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1408       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1409       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1410       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1411       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1412       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1413       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1414       v     += 45;
1415     }
1416     if (usecprow) z = zarray + 15*ridx[i];
1417     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1418     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1419 
1420     if (!usecprow) z += 15;
1421   }
1422 
1423   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1424   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1425   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1430 #undef __FUNCT__
1431 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver3"
1432 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1433 {
1434   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1435   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1436   const PetscScalar *x,*xb;
1437   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1438   const MatScalar   *v;
1439   PetscErrorCode    ierr;
1440   const PetscInt    *ii,*ij=a->j,*idx;
1441   PetscInt          mbs,i,j,n,*ridx=NULL;
1442   PetscBool         usecprow=a->compressedrow.use;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1446   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1447 
1448   v = a->a;
1449   if (usecprow) {
1450     mbs  = a->compressedrow.nrows;
1451     ii   = a->compressedrow.i;
1452     ridx = a->compressedrow.rindex;
1453   } else {
1454     mbs = a->mbs;
1455     ii  = a->i;
1456     z   = zarray;
1457   }
1458 
1459   for (i=0; i<mbs; i++) {
1460     n    = ii[i+1] - ii[i];
1461     idx  = ij + ii[i];
1462     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1463     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1464 
1465     for (j=0; j<n; j++) {
1466       xb = x + 15*(idx[j]);
1467       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1468       x8 = xb[7];
1469 
1470       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1471       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1472       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1473       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1474       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1475       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1476       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1477       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1478       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1479       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1480       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1481       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1482       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1483       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1484       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1485       v     += 120;
1486 
1487       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1488 
1489       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1490       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1491       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1492       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1493       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1494       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1495       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1496       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1497       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1498       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1499       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1500       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1501       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1502       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1503       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1504       v     += 105;
1505     }
1506     if (usecprow) z = zarray + 15*ridx[i];
1507     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1508     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1509 
1510     if (!usecprow) z += 15;
1511   }
1512 
1513   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1514   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1515   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1516   PetscFunctionReturn(0);
1517 }
1518 
1519 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1520 
1521 #undef __FUNCT__
1522 #define __FUNCT__ "MatMult_SeqBAIJ_15_ver4"
1523 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1524 {
1525   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1526   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1527   const PetscScalar *x,*xb;
1528   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1529   const MatScalar   *v;
1530   PetscErrorCode    ierr;
1531   const PetscInt    *ii,*ij=a->j,*idx;
1532   PetscInt          mbs,i,j,n,*ridx=NULL;
1533   PetscBool         usecprow=a->compressedrow.use;
1534 
1535   PetscFunctionBegin;
1536   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1537   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1538 
1539   v = a->a;
1540   if (usecprow) {
1541     mbs  = a->compressedrow.nrows;
1542     ii   = a->compressedrow.i;
1543     ridx = a->compressedrow.rindex;
1544   } else {
1545     mbs = a->mbs;
1546     ii  = a->i;
1547     z   = zarray;
1548   }
1549 
1550   for (i=0; i<mbs; i++) {
1551     n    = ii[i+1] - ii[i];
1552     idx  = ij + ii[i];
1553     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1554     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1555 
1556     for (j=0; j<n; j++) {
1557       xb = x + 15*(idx[j]);
1558       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1559       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1560 
1561       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1562       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1563       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1564       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1565       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1566       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1567       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1568       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1569       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1570       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1571       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1572       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1573       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1574       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1575       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1576       v     += 225;
1577     }
1578     if (usecprow) z = zarray + 15*ridx[i];
1579     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1580     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1581 
1582     if (!usecprow) z += 15;
1583   }
1584 
1585   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1586   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1587   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 
1592 /*
1593     This will not work with MatScalar == float because it calls the BLAS
1594 */
1595 #undef __FUNCT__
1596 #define __FUNCT__ "MatMult_SeqBAIJ_N"
1597 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1598 {
1599   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1600   PetscScalar       *z = 0,*work,*workt,*zarray;
1601   const PetscScalar *x,*xb;
1602   const MatScalar   *v;
1603   PetscErrorCode    ierr;
1604   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1605   const PetscInt    *idx,*ii,*ridx=NULL;
1606   PetscInt          ncols,k;
1607   PetscBool         usecprow=a->compressedrow.use;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1611   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1612 
1613   idx = a->j;
1614   v   = a->a;
1615   if (usecprow) {
1616     mbs  = a->compressedrow.nrows;
1617     ii   = a->compressedrow.i;
1618     ridx = a->compressedrow.rindex;
1619   } else {
1620     mbs = a->mbs;
1621     ii  = a->i;
1622     z   = zarray;
1623   }
1624 
1625   if (!a->mult_work) {
1626     k    = PetscMax(A->rmap->n,A->cmap->n);
1627     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1628   }
1629   work = a->mult_work;
1630   for (i=0; i<mbs; i++) {
1631     n           = ii[1] - ii[0]; ii++;
1632     ncols       = n*bs;
1633     workt       = work;
1634     for (j=0; j<n; j++) {
1635       xb = x + bs*(*idx++);
1636       for (k=0; k<bs; k++) workt[k] = xb[k];
1637       workt += bs;
1638     }
1639     if (usecprow) z = zarray + bs*ridx[i];
1640     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1641     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1642     v += n*bs2;
1643     if (!usecprow) z += bs;
1644   }
1645   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1646   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1647   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1648   PetscFunctionReturn(0);
1649 }
1650 
1651 #undef __FUNCT__
1652 #define __FUNCT__ "MatMultAdd_SeqBAIJ_1"
1653 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1654 {
1655   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1656   const PetscScalar *x;
1657   PetscScalar       *y,*z,sum;
1658   const MatScalar   *v;
1659   PetscErrorCode    ierr;
1660   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1661   const PetscInt    *idx,*ii;
1662   PetscBool         usecprow=a->compressedrow.use;
1663 
1664   PetscFunctionBegin;
1665   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1666   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1667 
1668   idx = a->j;
1669   v   = a->a;
1670   if (usecprow) {
1671     if (zz != yy) {
1672       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1673     }
1674     mbs  = a->compressedrow.nrows;
1675     ii   = a->compressedrow.i;
1676     ridx = a->compressedrow.rindex;
1677   } else {
1678     ii = a->i;
1679   }
1680 
1681   for (i=0; i<mbs; i++) {
1682     n = ii[1] - ii[0];
1683     ii++;
1684     if (!usecprow) {
1685       sum         = y[i];
1686     } else {
1687       sum = y[ridx[i]];
1688     }
1689     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1690     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1691     PetscSparseDensePlusDot(sum,x,v,idx,n);
1692     v   += n;
1693     idx += n;
1694     if (usecprow) {
1695       z[ridx[i]] = sum;
1696     } else {
1697       z[i] = sum;
1698     }
1699   }
1700   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1701   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1702   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1703   PetscFunctionReturn(0);
1704 }
1705 
1706 #undef __FUNCT__
1707 #define __FUNCT__ "MatMultAdd_SeqBAIJ_2"
1708 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1709 {
1710   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1711   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1712   const PetscScalar *x,*xb;
1713   PetscScalar       x1,x2,*yarray,*zarray;
1714   const MatScalar   *v;
1715   PetscErrorCode    ierr;
1716   PetscInt          mbs = a->mbs,i,n,j;
1717   const PetscInt    *idx,*ii,*ridx = NULL;
1718   PetscBool         usecprow = a->compressedrow.use;
1719 
1720   PetscFunctionBegin;
1721   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1722   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1723 
1724   idx = a->j;
1725   v   = a->a;
1726   if (usecprow) {
1727     if (zz != yy) {
1728       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1729     }
1730     mbs  = a->compressedrow.nrows;
1731     ii   = a->compressedrow.i;
1732     ridx = a->compressedrow.rindex;
1733     if (zz != yy) {
1734       ierr = PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1735     }
1736   } else {
1737     ii = a->i;
1738     y  = yarray;
1739     z  = zarray;
1740   }
1741 
1742   for (i=0; i<mbs; i++) {
1743     n = ii[1] - ii[0]; ii++;
1744     if (usecprow) {
1745       z = zarray + 2*ridx[i];
1746       y = yarray + 2*ridx[i];
1747     }
1748     sum1 = y[0]; sum2 = y[1];
1749     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1750     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1751     for (j=0; j<n; j++) {
1752       xb = x + 2*(*idx++);
1753       x1 = xb[0];
1754       x2 = xb[1];
1755 
1756       sum1 += v[0]*x1 + v[2]*x2;
1757       sum2 += v[1]*x1 + v[3]*x2;
1758       v    += 4;
1759     }
1760     z[0] = sum1; z[1] = sum2;
1761     if (!usecprow) {
1762       z += 2; y += 2;
1763     }
1764   }
1765   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1766   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1767   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1768   PetscFunctionReturn(0);
1769 }
1770 
1771 #undef __FUNCT__
1772 #define __FUNCT__ "MatMultAdd_SeqBAIJ_3"
1773 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1774 {
1775   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1776   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1777   const PetscScalar *x,*xb;
1778   const MatScalar   *v;
1779   PetscErrorCode    ierr;
1780   PetscInt          mbs = a->mbs,i,j,n;
1781   const PetscInt    *idx,*ii,*ridx = NULL;
1782   PetscBool         usecprow = a->compressedrow.use;
1783 
1784   PetscFunctionBegin;
1785   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1786   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1787 
1788   idx = a->j;
1789   v   = a->a;
1790   if (usecprow) {
1791     if (zz != yy) {
1792       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1793     }
1794     mbs  = a->compressedrow.nrows;
1795     ii   = a->compressedrow.i;
1796     ridx = a->compressedrow.rindex;
1797   } else {
1798     ii = a->i;
1799     y  = yarray;
1800     z  = zarray;
1801   }
1802 
1803   for (i=0; i<mbs; i++) {
1804     n = ii[1] - ii[0]; ii++;
1805     if (usecprow) {
1806       z = zarray + 3*ridx[i];
1807       y = yarray + 3*ridx[i];
1808     }
1809     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1810     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1811     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1812     for (j=0; j<n; j++) {
1813       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1814       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1815       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1816       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1817       v    += 9;
1818     }
1819     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1820     if (!usecprow) {
1821       z += 3; y += 3;
1822     }
1823   }
1824   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1825   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1826   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1827   PetscFunctionReturn(0);
1828 }
1829 
1830 #undef __FUNCT__
1831 #define __FUNCT__ "MatMultAdd_SeqBAIJ_4"
1832 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1833 {
1834   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1835   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1836   const PetscScalar *x,*xb;
1837   const MatScalar   *v;
1838   PetscErrorCode    ierr;
1839   PetscInt          mbs = a->mbs,i,j,n;
1840   const PetscInt    *idx,*ii,*ridx=NULL;
1841   PetscBool         usecprow=a->compressedrow.use;
1842 
1843   PetscFunctionBegin;
1844   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1845   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1846 
1847   idx = a->j;
1848   v   = a->a;
1849   if (usecprow) {
1850     if (zz != yy) {
1851       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1852     }
1853     mbs  = a->compressedrow.nrows;
1854     ii   = a->compressedrow.i;
1855     ridx = a->compressedrow.rindex;
1856   } else {
1857     ii = a->i;
1858     y  = yarray;
1859     z  = zarray;
1860   }
1861 
1862   for (i=0; i<mbs; i++) {
1863     n = ii[1] - ii[0]; ii++;
1864     if (usecprow) {
1865       z = zarray + 4*ridx[i];
1866       y = yarray + 4*ridx[i];
1867     }
1868     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1869     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1870     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1871     for (j=0; j<n; j++) {
1872       xb    = x + 4*(*idx++);
1873       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1874       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1875       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1876       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1877       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1878       v    += 16;
1879     }
1880     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1881     if (!usecprow) {
1882       z += 4; y += 4;
1883     }
1884   }
1885   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1886   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1887   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1888   PetscFunctionReturn(0);
1889 }
1890 
1891 #undef __FUNCT__
1892 #define __FUNCT__ "MatMultAdd_SeqBAIJ_5"
1893 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1894 {
1895   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1896   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1897   const PetscScalar *x,*xb;
1898   PetscScalar       *yarray,*zarray;
1899   const MatScalar   *v;
1900   PetscErrorCode    ierr;
1901   PetscInt          mbs = a->mbs,i,j,n;
1902   const PetscInt    *idx,*ii,*ridx = NULL;
1903   PetscBool         usecprow=a->compressedrow.use;
1904 
1905   PetscFunctionBegin;
1906   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1907   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1908 
1909   idx = a->j;
1910   v   = a->a;
1911   if (usecprow) {
1912     if (zz != yy) {
1913       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1914     }
1915     mbs  = a->compressedrow.nrows;
1916     ii   = a->compressedrow.i;
1917     ridx = a->compressedrow.rindex;
1918   } else {
1919     ii = a->i;
1920     y  = yarray;
1921     z  = zarray;
1922   }
1923 
1924   for (i=0; i<mbs; i++) {
1925     n = ii[1] - ii[0]; ii++;
1926     if (usecprow) {
1927       z = zarray + 5*ridx[i];
1928       y = yarray + 5*ridx[i];
1929     }
1930     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1931     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1932     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1933     for (j=0; j<n; j++) {
1934       xb    = x + 5*(*idx++);
1935       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1936       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1937       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1938       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1939       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1940       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1941       v    += 25;
1942     }
1943     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1944     if (!usecprow) {
1945       z += 5; y += 5;
1946     }
1947   }
1948   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1949   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1950   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1951   PetscFunctionReturn(0);
1952 }
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatMultAdd_SeqBAIJ_6"
1955 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1956 {
1957   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1958   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1959   const PetscScalar *x,*xb;
1960   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1961   const MatScalar   *v;
1962   PetscErrorCode    ierr;
1963   PetscInt          mbs = a->mbs,i,j,n;
1964   const PetscInt    *idx,*ii,*ridx=NULL;
1965   PetscBool         usecprow=a->compressedrow.use;
1966 
1967   PetscFunctionBegin;
1968   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1969   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1970 
1971   idx = a->j;
1972   v   = a->a;
1973   if (usecprow) {
1974     if (zz != yy) {
1975       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1976     }
1977     mbs  = a->compressedrow.nrows;
1978     ii   = a->compressedrow.i;
1979     ridx = a->compressedrow.rindex;
1980   } else {
1981     ii = a->i;
1982     y  = yarray;
1983     z  = zarray;
1984   }
1985 
1986   for (i=0; i<mbs; i++) {
1987     n = ii[1] - ii[0]; ii++;
1988     if (usecprow) {
1989       z = zarray + 6*ridx[i];
1990       y = yarray + 6*ridx[i];
1991     }
1992     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1993     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1994     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1995     for (j=0; j<n; j++) {
1996       xb    = x + 6*(*idx++);
1997       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1998       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1999       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2000       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2001       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2002       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2003       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2004       v    += 36;
2005     }
2006     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2007     if (!usecprow) {
2008       z += 6; y += 6;
2009     }
2010   }
2011   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2012   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2013   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2014   PetscFunctionReturn(0);
2015 }
2016 
2017 #undef __FUNCT__
2018 #define __FUNCT__ "MatMultAdd_SeqBAIJ_7"
2019 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2020 {
2021   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2022   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2023   const PetscScalar *x,*xb;
2024   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2025   const MatScalar   *v;
2026   PetscErrorCode    ierr;
2027   PetscInt          mbs = a->mbs,i,j,n;
2028   const PetscInt    *idx,*ii,*ridx = NULL;
2029   PetscBool         usecprow=a->compressedrow.use;
2030 
2031   PetscFunctionBegin;
2032   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2033   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2034 
2035   idx = a->j;
2036   v   = a->a;
2037   if (usecprow) {
2038     if (zz != yy) {
2039       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2040     }
2041     mbs  = a->compressedrow.nrows;
2042     ii   = a->compressedrow.i;
2043     ridx = a->compressedrow.rindex;
2044   } else {
2045     ii = a->i;
2046     y  = yarray;
2047     z  = zarray;
2048   }
2049 
2050   for (i=0; i<mbs; i++) {
2051     n = ii[1] - ii[0]; ii++;
2052     if (usecprow) {
2053       z = zarray + 7*ridx[i];
2054       y = yarray + 7*ridx[i];
2055     }
2056     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2057     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2058     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2059     for (j=0; j<n; j++) {
2060       xb    = x + 7*(*idx++);
2061       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2062       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2063       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2064       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2065       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2066       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2067       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2068       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2069       v    += 49;
2070     }
2071     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2072     if (!usecprow) {
2073       z += 7; y += 7;
2074     }
2075   }
2076   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2077   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2078   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2079   PetscFunctionReturn(0);
2080 }
2081 
2082 #undef __FUNCT__
2083 #define __FUNCT__ "MatMultAdd_SeqBAIJ_N"
2084 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2085 {
2086   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2087   PetscScalar       *z = 0,*work,*workt,*zarray;
2088   const PetscScalar *x,*xb;
2089   const MatScalar   *v;
2090   PetscErrorCode    ierr;
2091   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2092   PetscInt          ncols,k;
2093   const PetscInt    *ridx = NULL,*idx,*ii;
2094   PetscBool         usecprow = a->compressedrow.use;
2095 
2096   PetscFunctionBegin;
2097   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2098   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2099   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2100 
2101   idx = a->j;
2102   v   = a->a;
2103   if (usecprow) {
2104     mbs  = a->compressedrow.nrows;
2105     ii   = a->compressedrow.i;
2106     ridx = a->compressedrow.rindex;
2107   } else {
2108     mbs = a->mbs;
2109     ii  = a->i;
2110     z   = zarray;
2111   }
2112 
2113   if (!a->mult_work) {
2114     k    = PetscMax(A->rmap->n,A->cmap->n);
2115     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2116   }
2117   work = a->mult_work;
2118   for (i=0; i<mbs; i++) {
2119     n     = ii[1] - ii[0]; ii++;
2120     ncols = n*bs;
2121     workt = work;
2122     for (j=0; j<n; j++) {
2123       xb = x + bs*(*idx++);
2124       for (k=0; k<bs; k++) workt[k] = xb[k];
2125       workt += bs;
2126     }
2127     if (usecprow) z = zarray + bs*ridx[i];
2128     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2129     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2130     v += n*bs2;
2131     if (!usecprow) z += bs;
2132   }
2133   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2134   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2135   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2136   PetscFunctionReturn(0);
2137 }
2138 
2139 #undef __FUNCT__
2140 #define __FUNCT__ "MatMultHermitianTranspose_SeqBAIJ"
2141 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2142 {
2143   PetscScalar    zero = 0.0;
2144   PetscErrorCode ierr;
2145 
2146   PetscFunctionBegin;
2147   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2148   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2149   PetscFunctionReturn(0);
2150 }
2151 
2152 #undef __FUNCT__
2153 #define __FUNCT__ "MatMultTranspose_SeqBAIJ"
2154 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2155 {
2156   PetscScalar    zero = 0.0;
2157   PetscErrorCode ierr;
2158 
2159   PetscFunctionBegin;
2160   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2161   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2162   PetscFunctionReturn(0);
2163 }
2164 
2165 #undef __FUNCT__
2166 #define __FUNCT__ "MatMultHermitianTransposeAdd_SeqBAIJ"
2167 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2168 {
2169   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2170   PetscScalar       *z,x1,x2,x3,x4,x5;
2171   const PetscScalar *x,*xb = NULL;
2172   const MatScalar   *v;
2173   PetscErrorCode    ierr;
2174   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2175   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2176   Mat_CompressedRow cprow = a->compressedrow;
2177   PetscBool         usecprow = cprow.use;
2178 
2179   PetscFunctionBegin;
2180   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2181   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2182   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2183 
2184   idx = a->j;
2185   v   = a->a;
2186   if (usecprow) {
2187     mbs  = cprow.nrows;
2188     ii   = cprow.i;
2189     ridx = cprow.rindex;
2190   } else {
2191     mbs=a->mbs;
2192     ii = a->i;
2193     xb = x;
2194   }
2195 
2196   switch (bs) {
2197   case 1:
2198     for (i=0; i<mbs; i++) {
2199       if (usecprow) xb = x + ridx[i];
2200       x1 = xb[0];
2201       ib = idx + ii[0];
2202       n  = ii[1] - ii[0]; ii++;
2203       for (j=0; j<n; j++) {
2204         rval     = ib[j];
2205         z[rval] += PetscConj(*v) * x1;
2206         v++;
2207       }
2208       if (!usecprow) xb++;
2209     }
2210     break;
2211   case 2:
2212     for (i=0; i<mbs; i++) {
2213       if (usecprow) xb = x + 2*ridx[i];
2214       x1 = xb[0]; x2 = xb[1];
2215       ib = idx + ii[0];
2216       n  = ii[1] - ii[0]; ii++;
2217       for (j=0; j<n; j++) {
2218         rval       = ib[j]*2;
2219         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2220         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2221         v         += 4;
2222       }
2223       if (!usecprow) xb += 2;
2224     }
2225     break;
2226   case 3:
2227     for (i=0; i<mbs; i++) {
2228       if (usecprow) xb = x + 3*ridx[i];
2229       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2230       ib = idx + ii[0];
2231       n  = ii[1] - ii[0]; ii++;
2232       for (j=0; j<n; j++) {
2233         rval       = ib[j]*3;
2234         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2235         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2236         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2237         v         += 9;
2238       }
2239       if (!usecprow) xb += 3;
2240     }
2241     break;
2242   case 4:
2243     for (i=0; i<mbs; i++) {
2244       if (usecprow) xb = x + 4*ridx[i];
2245       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2246       ib = idx + ii[0];
2247       n  = ii[1] - ii[0]; ii++;
2248       for (j=0; j<n; j++) {
2249         rval       = ib[j]*4;
2250         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2251         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2252         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2253         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2254         v         += 16;
2255       }
2256       if (!usecprow) xb += 4;
2257     }
2258     break;
2259   case 5:
2260     for (i=0; i<mbs; i++) {
2261       if (usecprow) xb = x + 5*ridx[i];
2262       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2263       x4 = xb[3]; x5 = xb[4];
2264       ib = idx + ii[0];
2265       n  = ii[1] - ii[0]; ii++;
2266       for (j=0; j<n; j++) {
2267         rval       = ib[j]*5;
2268         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2269         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2270         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2271         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2272         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2273         v         += 25;
2274       }
2275       if (!usecprow) xb += 5;
2276     }
2277     break;
2278   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2279     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2280 #if 0
2281     {
2282       PetscInt          ncols,k,bs2=a->bs2;
2283       PetscScalar       *work,*workt,zb;
2284       const PetscScalar *xtmp;
2285       if (!a->mult_work) {
2286         k    = PetscMax(A->rmap->n,A->cmap->n);
2287         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2288       }
2289       work = a->mult_work;
2290       xtmp = x;
2291       for (i=0; i<mbs; i++) {
2292         n     = ii[1] - ii[0]; ii++;
2293         ncols = n*bs;
2294         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2295         if (usecprow) xtmp = x + bs*ridx[i];
2296         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2297         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2298         v += n*bs2;
2299         if (!usecprow) xtmp += bs;
2300         workt = work;
2301         for (j=0; j<n; j++) {
2302           zb = z + bs*(*idx++);
2303           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2304           workt += bs;
2305         }
2306       }
2307     }
2308 #endif
2309   }
2310   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2311   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2312   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2313   PetscFunctionReturn(0);
2314 }
2315 
2316 #undef __FUNCT__
2317 #define __FUNCT__ "MatMultTransposeAdd_SeqBAIJ"
2318 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2319 {
2320   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2321   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2322   const PetscScalar *x,*xb = 0;
2323   const MatScalar   *v;
2324   PetscErrorCode    ierr;
2325   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2326   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2327   Mat_CompressedRow cprow   = a->compressedrow;
2328   PetscBool         usecprow=cprow.use;
2329 
2330   PetscFunctionBegin;
2331   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2332   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2333   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2334 
2335   idx = a->j;
2336   v   = a->a;
2337   if (usecprow) {
2338     mbs  = cprow.nrows;
2339     ii   = cprow.i;
2340     ridx = cprow.rindex;
2341   } else {
2342     mbs=a->mbs;
2343     ii = a->i;
2344     xb = x;
2345   }
2346 
2347   switch (bs) {
2348   case 1:
2349     for (i=0; i<mbs; i++) {
2350       if (usecprow) xb = x + ridx[i];
2351       x1 = xb[0];
2352       ib = idx + ii[0];
2353       n  = ii[1] - ii[0]; ii++;
2354       for (j=0; j<n; j++) {
2355         rval     = ib[j];
2356         z[rval] += *v * x1;
2357         v++;
2358       }
2359       if (!usecprow) xb++;
2360     }
2361     break;
2362   case 2:
2363     for (i=0; i<mbs; i++) {
2364       if (usecprow) xb = x + 2*ridx[i];
2365       x1 = xb[0]; x2 = xb[1];
2366       ib = idx + ii[0];
2367       n  = ii[1] - ii[0]; ii++;
2368       for (j=0; j<n; j++) {
2369         rval       = ib[j]*2;
2370         z[rval++] += v[0]*x1 + v[1]*x2;
2371         z[rval++] += v[2]*x1 + v[3]*x2;
2372         v         += 4;
2373       }
2374       if (!usecprow) xb += 2;
2375     }
2376     break;
2377   case 3:
2378     for (i=0; i<mbs; i++) {
2379       if (usecprow) xb = x + 3*ridx[i];
2380       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2381       ib = idx + ii[0];
2382       n  = ii[1] - ii[0]; ii++;
2383       for (j=0; j<n; j++) {
2384         rval       = ib[j]*3;
2385         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2386         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2387         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2388         v         += 9;
2389       }
2390       if (!usecprow) xb += 3;
2391     }
2392     break;
2393   case 4:
2394     for (i=0; i<mbs; i++) {
2395       if (usecprow) xb = x + 4*ridx[i];
2396       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2397       ib = idx + ii[0];
2398       n  = ii[1] - ii[0]; ii++;
2399       for (j=0; j<n; j++) {
2400         rval       = ib[j]*4;
2401         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2402         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2403         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2404         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2405         v         += 16;
2406       }
2407       if (!usecprow) xb += 4;
2408     }
2409     break;
2410   case 5:
2411     for (i=0; i<mbs; i++) {
2412       if (usecprow) xb = x + 5*ridx[i];
2413       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2414       x4 = xb[3]; x5 = xb[4];
2415       ib = idx + ii[0];
2416       n  = ii[1] - ii[0]; ii++;
2417       for (j=0; j<n; j++) {
2418         rval       = ib[j]*5;
2419         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2420         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2421         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2422         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2423         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2424         v         += 25;
2425       }
2426       if (!usecprow) xb += 5;
2427     }
2428     break;
2429   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2430     PetscInt          ncols,k;
2431     PetscScalar       *work,*workt;
2432     const PetscScalar *xtmp;
2433     if (!a->mult_work) {
2434       k    = PetscMax(A->rmap->n,A->cmap->n);
2435       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2436     }
2437     work = a->mult_work;
2438     xtmp = x;
2439     for (i=0; i<mbs; i++) {
2440       n     = ii[1] - ii[0]; ii++;
2441       ncols = n*bs;
2442       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
2443       if (usecprow) xtmp = x + bs*ridx[i];
2444       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2445       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2446       v += n*bs2;
2447       if (!usecprow) xtmp += bs;
2448       workt = work;
2449       for (j=0; j<n; j++) {
2450         zb = z + bs*(*idx++);
2451         for (k=0; k<bs; k++) zb[k] += workt[k];
2452         workt += bs;
2453       }
2454     }
2455     }
2456   }
2457   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2458   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2459   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2460   PetscFunctionReturn(0);
2461 }
2462 
2463 #undef __FUNCT__
2464 #define __FUNCT__ "MatScale_SeqBAIJ"
2465 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2466 {
2467   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2468   PetscInt       totalnz = a->bs2*a->nz;
2469   PetscScalar    oalpha  = alpha;
2470   PetscErrorCode ierr;
2471   PetscBLASInt   one = 1,tnz;
2472 
2473   PetscFunctionBegin;
2474   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2475   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2476   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2477   PetscFunctionReturn(0);
2478 }
2479 
2480 #undef __FUNCT__
2481 #define __FUNCT__ "MatNorm_SeqBAIJ"
2482 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2483 {
2484   PetscErrorCode ierr;
2485   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2486   MatScalar      *v  = a->a;
2487   PetscReal      sum = 0.0;
2488   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2489 
2490   PetscFunctionBegin;
2491   if (type == NORM_FROBENIUS) {
2492     for (i=0; i< bs2*nz; i++) {
2493       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2494     }
2495     *norm = PetscSqrtReal(sum);
2496   } else if (type == NORM_1) { /* maximum column sum */
2497     PetscReal *tmp;
2498     PetscInt  *bcol = a->j;
2499     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2500     for (i=0; i<nz; i++) {
2501       for (j=0; j<bs; j++) {
2502         k1 = bs*(*bcol) + j; /* column index */
2503         for (k=0; k<bs; k++) {
2504           tmp[k1] += PetscAbsScalar(*v); v++;
2505         }
2506       }
2507       bcol++;
2508     }
2509     *norm = 0.0;
2510     for (j=0; j<A->cmap->n; j++) {
2511       if (tmp[j] > *norm) *norm = tmp[j];
2512     }
2513     ierr = PetscFree(tmp);CHKERRQ(ierr);
2514   } else if (type == NORM_INFINITY) { /* maximum row sum */
2515     *norm = 0.0;
2516     for (k=0; k<bs; k++) {
2517       for (j=0; j<a->mbs; j++) {
2518         v   = a->a + bs2*a->i[j] + k;
2519         sum = 0.0;
2520         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2521           for (k1=0; k1<bs; k1++) {
2522             sum += PetscAbsScalar(*v);
2523             v   += bs;
2524           }
2525         }
2526         if (sum > *norm) *norm = sum;
2527       }
2528     }
2529   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2530   PetscFunctionReturn(0);
2531 }
2532 
2533 
2534 #undef __FUNCT__
2535 #define __FUNCT__ "MatEqual_SeqBAIJ"
2536 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2537 {
2538   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2539   PetscErrorCode ierr;
2540 
2541   PetscFunctionBegin;
2542   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2543   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2544     *flg = PETSC_FALSE;
2545     PetscFunctionReturn(0);
2546   }
2547 
2548   /* if the a->i are the same */
2549   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2550   if (!*flg) PetscFunctionReturn(0);
2551 
2552   /* if a->j are the same */
2553   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2554   if (!*flg) PetscFunctionReturn(0);
2555 
2556   /* if a->a are the same */
2557   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2558   PetscFunctionReturn(0);
2559 
2560 }
2561 
2562 #undef __FUNCT__
2563 #define __FUNCT__ "MatGetDiagonal_SeqBAIJ"
2564 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2565 {
2566   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2567   PetscErrorCode ierr;
2568   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2569   PetscScalar    *x,zero = 0.0;
2570   MatScalar      *aa,*aa_j;
2571 
2572   PetscFunctionBegin;
2573   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2574   bs   = A->rmap->bs;
2575   aa   = a->a;
2576   ai   = a->i;
2577   aj   = a->j;
2578   ambs = a->mbs;
2579   bs2  = a->bs2;
2580 
2581   ierr = VecSet(v,zero);CHKERRQ(ierr);
2582   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2583   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2584   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2585   for (i=0; i<ambs; i++) {
2586     for (j=ai[i]; j<ai[i+1]; j++) {
2587       if (aj[j] == i) {
2588         row  = i*bs;
2589         aa_j = aa+j*bs2;
2590         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2591         break;
2592       }
2593     }
2594   }
2595   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2596   PetscFunctionReturn(0);
2597 }
2598 
2599 #undef __FUNCT__
2600 #define __FUNCT__ "MatDiagonalScale_SeqBAIJ"
2601 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2602 {
2603   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2604   const PetscScalar *l,*r,*li,*ri;
2605   PetscScalar       x;
2606   MatScalar         *aa, *v;
2607   PetscErrorCode    ierr;
2608   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2609   const PetscInt    *ai,*aj;
2610 
2611   PetscFunctionBegin;
2612   ai  = a->i;
2613   aj  = a->j;
2614   aa  = a->a;
2615   m   = A->rmap->n;
2616   n   = A->cmap->n;
2617   bs  = A->rmap->bs;
2618   mbs = a->mbs;
2619   bs2 = a->bs2;
2620   if (ll) {
2621     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2622     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2623     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2624     for (i=0; i<mbs; i++) { /* for each block row */
2625       M  = ai[i+1] - ai[i];
2626       li = l + i*bs;
2627       v  = aa + bs2*ai[i];
2628       for (j=0; j<M; j++) { /* for each block */
2629         for (k=0; k<bs2; k++) {
2630           (*v++) *= li[k%bs];
2631         }
2632       }
2633     }
2634     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2635     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2636   }
2637 
2638   if (rr) {
2639     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2640     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2641     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2642     for (i=0; i<mbs; i++) { /* for each block row */
2643       iai = ai[i];
2644       M   = ai[i+1] - iai;
2645       v   = aa + bs2*iai;
2646       for (j=0; j<M; j++) { /* for each block */
2647         ri = r + bs*aj[iai+j];
2648         for (k=0; k<bs; k++) {
2649           x = ri[k];
2650           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2651           v += bs;
2652         }
2653       }
2654     }
2655     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2656     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2657   }
2658   PetscFunctionReturn(0);
2659 }
2660 
2661 
2662 #undef __FUNCT__
2663 #define __FUNCT__ "MatGetInfo_SeqBAIJ"
2664 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2665 {
2666   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2667 
2668   PetscFunctionBegin;
2669   info->block_size   = a->bs2;
2670   info->nz_allocated = a->bs2*a->maxnz;
2671   info->nz_used      = a->bs2*a->nz;
2672   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2673   info->assemblies   = A->num_ass;
2674   info->mallocs      = A->info.mallocs;
2675   info->memory       = ((PetscObject)A)->mem;
2676   if (A->factortype) {
2677     info->fill_ratio_given  = A->info.fill_ratio_given;
2678     info->fill_ratio_needed = A->info.fill_ratio_needed;
2679     info->factor_mallocs    = A->info.factor_mallocs;
2680   } else {
2681     info->fill_ratio_given  = 0;
2682     info->fill_ratio_needed = 0;
2683     info->factor_mallocs    = 0;
2684   }
2685   PetscFunctionReturn(0);
2686 }
2687 
2688 
2689 #if defined(PETSC_THREADCOMM_ACTIVE)
2690 PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2691 {
2692   PetscErrorCode ierr;
2693   PetscInt       *trstarts=A->rmap->trstarts;
2694   PetscInt       n,start,end;
2695   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
2696 
2697   start = trstarts[thread_id]/A->rmap->bs;
2698   end   = trstarts[thread_id+1]/A->rmap->bs;
2699   n     = a->i[end] - a->i[start];
2700   ierr  = PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
2701   return 0;
2702 }
2703 
2704 #undef __FUNCT__
2705 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2706 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2707 {
2708   PetscErrorCode ierr;
2709 
2710 
2711   PetscFunctionBegin;
2712   ierr = PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);CHKERRQ(ierr);
2713   PetscFunctionReturn(0);
2714 }
2715 #else
2716 #undef __FUNCT__
2717 #define __FUNCT__ "MatZeroEntries_SeqBAIJ"
2718 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2719 {
2720   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2721   PetscErrorCode ierr;
2722 
2723   PetscFunctionBegin;
2724   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2725   PetscFunctionReturn(0);
2726 }
2727 #endif
2728