xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 94ef8dde638caef1d0cd84a7dc8a2db65fcda8b6)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8 {
9   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
10   PetscErrorCode ierr;
11   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
12   const PetscInt *idx;
13   PetscInt       start,end,*ai,*aj,bs,*nidx2;
14   PetscBT        table;
15 
16   PetscFunctionBegin;
17   m  = a->mbs;
18   ai = a->i;
19   aj = a->j;
20   bs = A->rmap->bs;
21 
22   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23 
24   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
25   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) {
29     /* Initialise the two local arrays */
30     isz  = 0;
31     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38     for (j=0; j<n; ++j) {
39       ival = idx[j]/bs; /* convert the indices into block indices */
40       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42     }
43     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
44     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
45 
46     k = 0;
47     for (j=0; j<ov; j++) { /* for each overlap*/
48       n = isz;
49       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
50         row   = nidx[k];
51         start = ai[row];
52         end   = ai[row+1];
53         for (l = start; l<end; l++) {
54           val = aj[l];
55           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56         }
57       }
58     }
59     /* expand the Index Set */
60     for (j=0; j<isz; j++) {
61       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62     }
63     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
64   }
65   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
66   ierr = PetscFree(nidx);CHKERRQ(ierr);
67   ierr = PetscFree(nidx2);CHKERRQ(ierr);
68   PetscFunctionReturn(0);
69 }
70 
71 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72 {
73   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
74   PetscErrorCode ierr;
75   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
77   const PetscInt *irow,*icol;
78   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79   PetscInt       *aj = a->j,*ai = a->i;
80   MatScalar      *mat_a;
81   Mat            C;
82   PetscBool      flag;
83 
84   PetscFunctionBegin;
85   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
86   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
87   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
88   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
89 
90   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
91   ssmap = smap;
92   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
93   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94   /* determine lens of each row */
95   for (i=0; i<nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k=kstart; k<kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ*)((*B)->data);
106 
107     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
109     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
111     C    = *B;
112   } else {
113     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
114     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
115     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
116     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
117   }
118   c = (Mat_SeqBAIJ*)(C->data);
119   for (i=0; i<nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j + mat_i;
125     mat_a    = c->a + mat_i*bs2;
126     mat_ilen = c->ilen + i;
127     for (k=kstart; k<kend; k++) {
128       if ((tcol=ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
131         mat_a   += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   {
138     MatScalar *work;
139     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
140     for (i=0; i<nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i*bs2;
145       ilen  = c->ilen[i];
146       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
147     }
148     ierr = PetscFree(work);CHKERRQ(ierr);
149   }
150 
151   /* Free work space */
152   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153   ierr = PetscFree(smap);CHKERRQ(ierr);
154   ierr = PetscFree(lens);CHKERRQ(ierr);
155   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157 
158   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159   *B   = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   maxmnbs = PetscMax(a->mbs,a->nbs);
180   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185   }
186   count = 0;
187   for (i=0; i<nrows; i++) {
188     j = irow[i] / bs;
189     if ((vary[j]--)==bs) iary[count++] = j;
190   }
191   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
192 
193   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
194   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195   for (i=0; i<a->nbs; i++) {
196     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197   }
198   count = 0;
199   for (i=0; i<ncols; i++) {
200     j = icol[i] / bs;
201     if ((vary[j]--)==bs) iary[count++] = j;
202   }
203   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(&is1);CHKERRQ(ierr);
210   ierr = ISDestroy(&is2);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatDestroy_SeqBAIJ_Submatrices(Mat C)
215 {
216   PetscErrorCode ierr;
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubSppt    *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   ierr = submatj->destroy(C);CHKERRQ(ierr);
222   ierr = MatDestroySubMatrices_Private(submatj);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
227 {
228   PetscErrorCode ierr;
229   PetscInt       i;
230 
231   PetscFunctionBegin;
232   if (scall == MAT_INITIAL_MATRIX) {
233     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
234   }
235 
236   for (i=0; i<n; i++) {
237     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 
243 /* -------------------------------------------------------*/
244 /* Should check that shapes of vectors and matrices match */
245 /* -------------------------------------------------------*/
246 
247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
248 {
249   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
250   PetscScalar       *z,sum;
251   const PetscScalar *x;
252   const MatScalar   *v;
253   PetscErrorCode    ierr;
254   PetscInt          mbs,i,n;
255   const PetscInt    *idx,*ii,*ridx=NULL;
256   PetscBool         usecprow=a->compressedrow.use;
257 
258   PetscFunctionBegin;
259   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
261 
262   if (usecprow) {
263     mbs  = a->compressedrow.nrows;
264     ii   = a->compressedrow.i;
265     ridx = a->compressedrow.rindex;
266     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
267   } else {
268     mbs = a->mbs;
269     ii  = a->i;
270   }
271 
272   for (i=0; i<mbs; i++) {
273     n   = ii[1] - ii[0];
274     v   = a->a + ii[0];
275     idx = a->j + ii[0];
276     ii++;
277     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
278     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
279     sum = 0.0;
280     PetscSparseDensePlusDot(sum,x,v,idx,n);
281     if (usecprow) {
282       z[ridx[i]] = sum;
283     } else {
284       z[i]       = sum;
285     }
286   }
287   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
288   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
289   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
290   PetscFunctionReturn(0);
291 }
292 
293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
296   PetscScalar       *z = 0,sum1,sum2,*zarray;
297   const PetscScalar *x,*xb;
298   PetscScalar       x1,x2;
299   const MatScalar   *v;
300   PetscErrorCode    ierr;
301   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
302   PetscBool         usecprow=a->compressedrow.use;
303 
304   PetscFunctionBegin;
305   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
307 
308   idx = a->j;
309   v   = a->a;
310   if (usecprow) {
311     mbs  = a->compressedrow.nrows;
312     ii   = a->compressedrow.i;
313     ridx = a->compressedrow.rindex;
314     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
315   } else {
316     mbs = a->mbs;
317     ii  = a->i;
318     z   = zarray;
319   }
320 
321   for (i=0; i<mbs; i++) {
322     n           = ii[1] - ii[0]; ii++;
323     sum1        = 0.0; sum2 = 0.0;
324     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
325     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
326     for (j=0; j<n; j++) {
327       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
328       sum1 += v[0]*x1 + v[2]*x2;
329       sum2 += v[1]*x1 + v[3]*x2;
330       v    += 4;
331     }
332     if (usecprow) z = zarray + 2*ridx[i];
333     z[0] = sum1; z[1] = sum2;
334     if (!usecprow) z += 2;
335   }
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
338   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
343 {
344   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
345   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
346   const PetscScalar *x,*xb;
347   const MatScalar   *v;
348   PetscErrorCode    ierr;
349   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
350   PetscBool         usecprow=a->compressedrow.use;
351 
352 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
353 #pragma disjoint(*v,*z,*xb)
354 #endif
355 
356   PetscFunctionBegin;
357   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
358   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
359 
360   idx = a->j;
361   v   = a->a;
362   if (usecprow) {
363     mbs  = a->compressedrow.nrows;
364     ii   = a->compressedrow.i;
365     ridx = a->compressedrow.rindex;
366     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
367   } else {
368     mbs = a->mbs;
369     ii  = a->i;
370     z   = zarray;
371   }
372 
373   for (i=0; i<mbs; i++) {
374     n           = ii[1] - ii[0]; ii++;
375     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
376     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
377     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378     for (j=0; j<n; j++) {
379       xb = x + 3*(*idx++);
380       x1 = xb[0];
381       x2 = xb[1];
382       x3 = xb[2];
383 
384       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
385       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
386       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
387       v    += 9;
388     }
389     if (usecprow) z = zarray + 3*ridx[i];
390     z[0] = sum1; z[1] = sum2; z[2] = sum3;
391     if (!usecprow) z += 3;
392   }
393   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
395   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
396   PetscFunctionReturn(0);
397 }
398 
399 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
400 {
401   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
402   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
403   const PetscScalar *x,*xb;
404   const MatScalar   *v;
405   PetscErrorCode    ierr;
406   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
407   PetscBool         usecprow=a->compressedrow.use;
408 
409   PetscFunctionBegin;
410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
412 
413   idx = a->j;
414   v   = a->a;
415   if (usecprow) {
416     mbs  = a->compressedrow.nrows;
417     ii   = a->compressedrow.i;
418     ridx = a->compressedrow.rindex;
419     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
420   } else {
421     mbs = a->mbs;
422     ii  = a->i;
423     z   = zarray;
424   }
425 
426   for (i=0; i<mbs; i++) {
427     n = ii[1] - ii[0];
428     ii++;
429     sum1 = 0.0;
430     sum2 = 0.0;
431     sum3 = 0.0;
432     sum4 = 0.0;
433 
434     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
435     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
436     for (j=0; j<n; j++) {
437       xb    = x + 4*(*idx++);
438       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
439       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
440       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
441       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
442       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
443       v    += 16;
444     }
445     if (usecprow) z = zarray + 4*ridx[i];
446     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
447     if (!usecprow) z += 4;
448   }
449   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
450   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
451   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
452   PetscFunctionReturn(0);
453 }
454 
455 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
456 {
457   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
458   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
459   const PetscScalar *xb,*x;
460   const MatScalar   *v;
461   PetscErrorCode    ierr;
462   const PetscInt    *idx,*ii,*ridx=NULL;
463   PetscInt          mbs,i,j,n;
464   PetscBool         usecprow=a->compressedrow.use;
465 
466   PetscFunctionBegin;
467   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
469 
470   idx = a->j;
471   v   = a->a;
472   if (usecprow) {
473     mbs  = a->compressedrow.nrows;
474     ii   = a->compressedrow.i;
475     ridx = a->compressedrow.rindex;
476     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
477   } else {
478     mbs = a->mbs;
479     ii  = a->i;
480     z   = zarray;
481   }
482 
483   for (i=0; i<mbs; i++) {
484     n           = ii[1] - ii[0]; ii++;
485     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
486     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
487     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488     for (j=0; j<n; j++) {
489       xb    = x + 5*(*idx++);
490       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
491       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
492       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
493       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
494       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
495       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
496       v    += 25;
497     }
498     if (usecprow) z = zarray + 5*ridx[i];
499     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
500     if (!usecprow) z += 5;
501   }
502   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
503   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
504   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
505   PetscFunctionReturn(0);
506 }
507 
508 
509 
510 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
511 {
512   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
513   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
514   const PetscScalar *x,*xb;
515   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
516   const MatScalar   *v;
517   PetscErrorCode    ierr;
518   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
519   PetscBool         usecprow=a->compressedrow.use;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
523   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
524 
525   idx = a->j;
526   v   = a->a;
527   if (usecprow) {
528     mbs  = a->compressedrow.nrows;
529     ii   = a->compressedrow.i;
530     ridx = a->compressedrow.rindex;
531     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
532   } else {
533     mbs = a->mbs;
534     ii  = a->i;
535     z   = zarray;
536   }
537 
538   for (i=0; i<mbs; i++) {
539     n  = ii[1] - ii[0];
540     ii++;
541     sum1 = 0.0;
542     sum2 = 0.0;
543     sum3 = 0.0;
544     sum4 = 0.0;
545     sum5 = 0.0;
546     sum6 = 0.0;
547 
548     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
549     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
550     for (j=0; j<n; j++) {
551       xb    = x + 6*(*idx++);
552       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
553       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
554       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
555       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
556       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
557       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
558       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
559       v    += 36;
560     }
561     if (usecprow) z = zarray + 6*ridx[i];
562     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
563     if (!usecprow) z += 6;
564   }
565 
566   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
567   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
568   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
569   PetscFunctionReturn(0);
570 }
571 
572 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
573 {
574   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
575   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
576   const PetscScalar *x,*xb;
577   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
578   const MatScalar   *v;
579   PetscErrorCode    ierr;
580   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
581   PetscBool         usecprow=a->compressedrow.use;
582 
583   PetscFunctionBegin;
584   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
585   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
586 
587   idx = a->j;
588   v   = a->a;
589   if (usecprow) {
590     mbs  = a->compressedrow.nrows;
591     ii   = a->compressedrow.i;
592     ridx = a->compressedrow.rindex;
593     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
594   } else {
595     mbs = a->mbs;
596     ii  = a->i;
597     z   = zarray;
598   }
599 
600   for (i=0; i<mbs; i++) {
601     n  = ii[1] - ii[0];
602     ii++;
603     sum1 = 0.0;
604     sum2 = 0.0;
605     sum3 = 0.0;
606     sum4 = 0.0;
607     sum5 = 0.0;
608     sum6 = 0.0;
609     sum7 = 0.0;
610 
611     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
612     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
613     for (j=0; j<n; j++) {
614       xb    = x + 7*(*idx++);
615       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
616       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
617       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
618       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
619       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
620       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
621       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
622       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
623       v    += 49;
624     }
625     if (usecprow) z = zarray + 7*ridx[i];
626     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
627     if (!usecprow) z += 7;
628   }
629 
630   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
631   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
632   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
633   PetscFunctionReturn(0);
634 }
635 
636 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
637 {
638   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
639   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
640   const PetscScalar *x,*xb;
641   PetscScalar       *zarray,xv;
642   const MatScalar   *v;
643   PetscErrorCode    ierr;
644   const PetscInt    *ii,*ij=a->j,*idx;
645   PetscInt          mbs,i,j,k,n,*ridx=NULL;
646   PetscBool         usecprow=a->compressedrow.use;
647 
648   PetscFunctionBegin;
649   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
650   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
651 
652   v = a->a;
653   if (usecprow) {
654     mbs  = a->compressedrow.nrows;
655     ii   = a->compressedrow.i;
656     ridx = a->compressedrow.rindex;
657     ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
658   } else {
659     mbs = a->mbs;
660     ii  = a->i;
661     z   = zarray;
662   }
663 
664   for (i=0; i<mbs; i++) {
665     n    = ii[i+1] - ii[i];
666     idx  = ij + ii[i];
667     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
668     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
669 
670     for (j=0; j<n; j++) {
671       xb = x + 11*(idx[j]);
672 
673       for (k=0; k<11; k++) {
674         xv     =  xb[k];
675         sum1  += v[0]*xv;
676         sum2  += v[1]*xv;
677         sum3  += v[2]*xv;
678         sum4  += v[3]*xv;
679         sum5  += v[4]*xv;
680         sum6  += v[5]*xv;
681         sum7  += v[6]*xv;
682         sum8  += v[7]*xv;
683         sum9  += v[8]*xv;
684         sum10 += v[9]*xv;
685         sum11 += v[10]*xv;
686         v     += 11;
687       }
688     }
689     if (usecprow) z = zarray + 11*ridx[i];
690     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
691     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
692 
693     if (!usecprow) z += 11;
694   }
695 
696   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
697   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
698   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
703 /* Default MatMult for block size 15 */
704 
705 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
706 {
707   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
708   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
709   const PetscScalar *x,*xb;
710   PetscScalar       *zarray,xv;
711   const MatScalar   *v;
712   PetscErrorCode    ierr;
713   const PetscInt    *ii,*ij=a->j,*idx;
714   PetscInt          mbs,i,j,k,n,*ridx=NULL;
715   PetscBool         usecprow=a->compressedrow.use;
716 
717   PetscFunctionBegin;
718   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
719   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
720 
721   v = a->a;
722   if (usecprow) {
723     mbs  = a->compressedrow.nrows;
724     ii   = a->compressedrow.i;
725     ridx = a->compressedrow.rindex;
726     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
727   } else {
728     mbs = a->mbs;
729     ii  = a->i;
730     z   = zarray;
731   }
732 
733   for (i=0; i<mbs; i++) {
734     n    = ii[i+1] - ii[i];
735     idx  = ij + ii[i];
736     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
737     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
738 
739     for (j=0; j<n; j++) {
740       xb = x + 15*(idx[j]);
741 
742       for (k=0; k<15; k++) {
743         xv     =  xb[k];
744         sum1  += v[0]*xv;
745         sum2  += v[1]*xv;
746         sum3  += v[2]*xv;
747         sum4  += v[3]*xv;
748         sum5  += v[4]*xv;
749         sum6  += v[5]*xv;
750         sum7  += v[6]*xv;
751         sum8  += v[7]*xv;
752         sum9  += v[8]*xv;
753         sum10 += v[9]*xv;
754         sum11 += v[10]*xv;
755         sum12 += v[11]*xv;
756         sum13 += v[12]*xv;
757         sum14 += v[13]*xv;
758         sum15 += v[14]*xv;
759         v     += 15;
760       }
761     }
762     if (usecprow) z = zarray + 15*ridx[i];
763     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
764     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
765 
766     if (!usecprow) z += 15;
767   }
768 
769   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
771   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
776 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
777 {
778   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
779   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
780   const PetscScalar *x,*xb;
781   PetscScalar       x1,x2,x3,x4,*zarray;
782   const MatScalar   *v;
783   PetscErrorCode    ierr;
784   const PetscInt    *ii,*ij=a->j,*idx;
785   PetscInt          mbs,i,j,n,*ridx=NULL;
786   PetscBool         usecprow=a->compressedrow.use;
787 
788   PetscFunctionBegin;
789   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
790   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
791 
792   v = a->a;
793   if (usecprow) {
794     mbs  = a->compressedrow.nrows;
795     ii   = a->compressedrow.i;
796     ridx = a->compressedrow.rindex;
797     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
798   } else {
799     mbs = a->mbs;
800     ii  = a->i;
801     z   = zarray;
802   }
803 
804   for (i=0; i<mbs; i++) {
805     n    = ii[i+1] - ii[i];
806     idx  = ij + ii[i];
807     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
808     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
809 
810     for (j=0; j<n; j++) {
811       xb = x + 15*(idx[j]);
812       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
813 
814       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
815       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
816       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
817       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
818       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
819       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
820       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
821       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
822       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
823       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
824       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
825       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
826       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
827       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
828       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
829 
830       v += 60;
831 
832       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
833 
834       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
835       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
836       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
837       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
838       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
839       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
840       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
841       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
842       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
843       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
844       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
845       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
846       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
847       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
848       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
849       v     += 60;
850 
851       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
852       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
853       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
854       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
855       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
856       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
857       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
858       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
859       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
860       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
861       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
862       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
863       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
864       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
865       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
866       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
867       v     += 60;
868 
869       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
870       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
871       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
872       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
873       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
874       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
875       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
876       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
877       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
878       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
879       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
880       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
881       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
882       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
883       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
884       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
885       v     += 45;
886     }
887     if (usecprow) z = zarray + 15*ridx[i];
888     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
889     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
890 
891     if (!usecprow) z += 15;
892   }
893 
894   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
895   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
896   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
897   PetscFunctionReturn(0);
898 }
899 
900 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
901 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
902 {
903   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
904   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
905   const PetscScalar *x,*xb;
906   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
907   const MatScalar   *v;
908   PetscErrorCode    ierr;
909   const PetscInt    *ii,*ij=a->j,*idx;
910   PetscInt          mbs,i,j,n,*ridx=NULL;
911   PetscBool         usecprow=a->compressedrow.use;
912 
913   PetscFunctionBegin;
914   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
915   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
916 
917   v = a->a;
918   if (usecprow) {
919     mbs  = a->compressedrow.nrows;
920     ii   = a->compressedrow.i;
921     ridx = a->compressedrow.rindex;
922     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
923   } else {
924     mbs = a->mbs;
925     ii  = a->i;
926     z   = zarray;
927   }
928 
929   for (i=0; i<mbs; i++) {
930     n    = ii[i+1] - ii[i];
931     idx  = ij + ii[i];
932     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
933     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
934 
935     for (j=0; j<n; j++) {
936       xb = x + 15*(idx[j]);
937       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
938       x8 = xb[7];
939 
940       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
941       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
942       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
943       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
944       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
945       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
946       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
947       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
948       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
949       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
950       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
951       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
952       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
953       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
954       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
955       v     += 120;
956 
957       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
958 
959       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
960       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
961       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
962       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
963       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
964       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
965       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
966       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
967       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
968       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
969       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
970       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
971       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
972       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
973       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
974       v     += 105;
975     }
976     if (usecprow) z = zarray + 15*ridx[i];
977     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
978     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
979 
980     if (!usecprow) z += 15;
981   }
982 
983   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
984   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
985   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
986   PetscFunctionReturn(0);
987 }
988 
989 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
990 
991 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
992 {
993   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
994   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
995   const PetscScalar *x,*xb;
996   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
997   const MatScalar   *v;
998   PetscErrorCode    ierr;
999   const PetscInt    *ii,*ij=a->j,*idx;
1000   PetscInt          mbs,i,j,n,*ridx=NULL;
1001   PetscBool         usecprow=a->compressedrow.use;
1002 
1003   PetscFunctionBegin;
1004   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1005   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1006 
1007   v = a->a;
1008   if (usecprow) {
1009     mbs  = a->compressedrow.nrows;
1010     ii   = a->compressedrow.i;
1011     ridx = a->compressedrow.rindex;
1012     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1013   } else {
1014     mbs = a->mbs;
1015     ii  = a->i;
1016     z   = zarray;
1017   }
1018 
1019   for (i=0; i<mbs; i++) {
1020     n    = ii[i+1] - ii[i];
1021     idx  = ij + ii[i];
1022     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1023     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1024 
1025     for (j=0; j<n; j++) {
1026       xb = x + 15*(idx[j]);
1027       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1028       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1029 
1030       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1031       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1032       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1033       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1034       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1035       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1036       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1037       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1038       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1039       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1040       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1041       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1042       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1043       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1044       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1045       v     += 225;
1046     }
1047     if (usecprow) z = zarray + 15*ridx[i];
1048     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1049     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1050 
1051     if (!usecprow) z += 15;
1052   }
1053 
1054   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1055   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1056   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1057   PetscFunctionReturn(0);
1058 }
1059 
1060 
1061 /*
1062     This will not work with MatScalar == float because it calls the BLAS
1063 */
1064 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1065 {
1066   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1067   PetscScalar       *z = 0,*work,*workt,*zarray;
1068   const PetscScalar *x,*xb;
1069   const MatScalar   *v;
1070   PetscErrorCode    ierr;
1071   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1072   const PetscInt    *idx,*ii,*ridx=NULL;
1073   PetscInt          ncols,k;
1074   PetscBool         usecprow=a->compressedrow.use;
1075 
1076   PetscFunctionBegin;
1077   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1078   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1079 
1080   idx = a->j;
1081   v   = a->a;
1082   if (usecprow) {
1083     mbs  = a->compressedrow.nrows;
1084     ii   = a->compressedrow.i;
1085     ridx = a->compressedrow.rindex;
1086     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1087   } else {
1088     mbs = a->mbs;
1089     ii  = a->i;
1090     z   = zarray;
1091   }
1092 
1093   if (!a->mult_work) {
1094     k    = PetscMax(A->rmap->n,A->cmap->n);
1095     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1096   }
1097   work = a->mult_work;
1098   for (i=0; i<mbs; i++) {
1099     n           = ii[1] - ii[0]; ii++;
1100     ncols       = n*bs;
1101     workt       = work;
1102     for (j=0; j<n; j++) {
1103       xb = x + bs*(*idx++);
1104       for (k=0; k<bs; k++) workt[k] = xb[k];
1105       workt += bs;
1106     }
1107     if (usecprow) z = zarray + bs*ridx[i];
1108     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1109     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1110     v += n*bs2;
1111     if (!usecprow) z += bs;
1112   }
1113   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1114   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1115   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1120 {
1121   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1122   const PetscScalar *x;
1123   PetscScalar       *y,*z,sum;
1124   const MatScalar   *v;
1125   PetscErrorCode    ierr;
1126   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1127   const PetscInt    *idx,*ii;
1128   PetscBool         usecprow=a->compressedrow.use;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1132   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1133 
1134   idx = a->j;
1135   v   = a->a;
1136   if (usecprow) {
1137     if (zz != yy) {
1138       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1139     }
1140     mbs  = a->compressedrow.nrows;
1141     ii   = a->compressedrow.i;
1142     ridx = a->compressedrow.rindex;
1143   } else {
1144     ii = a->i;
1145   }
1146 
1147   for (i=0; i<mbs; i++) {
1148     n = ii[1] - ii[0];
1149     ii++;
1150     if (!usecprow) {
1151       sum         = y[i];
1152     } else {
1153       sum = y[ridx[i]];
1154     }
1155     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1156     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1157     PetscSparseDensePlusDot(sum,x,v,idx,n);
1158     v   += n;
1159     idx += n;
1160     if (usecprow) {
1161       z[ridx[i]] = sum;
1162     } else {
1163       z[i] = sum;
1164     }
1165   }
1166   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1167   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1173 {
1174   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1175   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1176   const PetscScalar *x,*xb;
1177   PetscScalar       x1,x2,*yarray,*zarray;
1178   const MatScalar   *v;
1179   PetscErrorCode    ierr;
1180   PetscInt          mbs = a->mbs,i,n,j;
1181   const PetscInt    *idx,*ii,*ridx = NULL;
1182   PetscBool         usecprow = a->compressedrow.use;
1183 
1184   PetscFunctionBegin;
1185   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1186   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1187 
1188   idx = a->j;
1189   v   = a->a;
1190   if (usecprow) {
1191     if (zz != yy) {
1192       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1193     }
1194     mbs  = a->compressedrow.nrows;
1195     ii   = a->compressedrow.i;
1196     ridx = a->compressedrow.rindex;
1197   } else {
1198     ii = a->i;
1199     y  = yarray;
1200     z  = zarray;
1201   }
1202 
1203   for (i=0; i<mbs; i++) {
1204     n = ii[1] - ii[0]; ii++;
1205     if (usecprow) {
1206       z = zarray + 2*ridx[i];
1207       y = yarray + 2*ridx[i];
1208     }
1209     sum1 = y[0]; sum2 = y[1];
1210     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1211     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1212     for (j=0; j<n; j++) {
1213       xb = x + 2*(*idx++);
1214       x1 = xb[0];
1215       x2 = xb[1];
1216 
1217       sum1 += v[0]*x1 + v[2]*x2;
1218       sum2 += v[1]*x1 + v[3]*x2;
1219       v    += 4;
1220     }
1221     z[0] = sum1; z[1] = sum2;
1222     if (!usecprow) {
1223       z += 2; y += 2;
1224     }
1225   }
1226   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1227   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1228   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1233 {
1234   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1235   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1236   const PetscScalar *x,*xb;
1237   const MatScalar   *v;
1238   PetscErrorCode    ierr;
1239   PetscInt          mbs = a->mbs,i,j,n;
1240   const PetscInt    *idx,*ii,*ridx = NULL;
1241   PetscBool         usecprow = a->compressedrow.use;
1242 
1243   PetscFunctionBegin;
1244   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1245   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1246 
1247   idx = a->j;
1248   v   = a->a;
1249   if (usecprow) {
1250     if (zz != yy) {
1251       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1252     }
1253     mbs  = a->compressedrow.nrows;
1254     ii   = a->compressedrow.i;
1255     ridx = a->compressedrow.rindex;
1256   } else {
1257     ii = a->i;
1258     y  = yarray;
1259     z  = zarray;
1260   }
1261 
1262   for (i=0; i<mbs; i++) {
1263     n = ii[1] - ii[0]; ii++;
1264     if (usecprow) {
1265       z = zarray + 3*ridx[i];
1266       y = yarray + 3*ridx[i];
1267     }
1268     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1269     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1270     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1271     for (j=0; j<n; j++) {
1272       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1273       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1274       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1275       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1276       v    += 9;
1277     }
1278     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1279     if (!usecprow) {
1280       z += 3; y += 3;
1281     }
1282   }
1283   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1284   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1285   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1290 {
1291   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1292   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1293   const PetscScalar *x,*xb;
1294   const MatScalar   *v;
1295   PetscErrorCode    ierr;
1296   PetscInt          mbs = a->mbs,i,j,n;
1297   const PetscInt    *idx,*ii,*ridx=NULL;
1298   PetscBool         usecprow=a->compressedrow.use;
1299 
1300   PetscFunctionBegin;
1301   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1302   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1303 
1304   idx = a->j;
1305   v   = a->a;
1306   if (usecprow) {
1307     if (zz != yy) {
1308       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1309     }
1310     mbs  = a->compressedrow.nrows;
1311     ii   = a->compressedrow.i;
1312     ridx = a->compressedrow.rindex;
1313   } else {
1314     ii = a->i;
1315     y  = yarray;
1316     z  = zarray;
1317   }
1318 
1319   for (i=0; i<mbs; i++) {
1320     n = ii[1] - ii[0]; ii++;
1321     if (usecprow) {
1322       z = zarray + 4*ridx[i];
1323       y = yarray + 4*ridx[i];
1324     }
1325     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1326     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1327     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1328     for (j=0; j<n; j++) {
1329       xb    = x + 4*(*idx++);
1330       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1331       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1332       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1333       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1334       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1335       v    += 16;
1336     }
1337     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1338     if (!usecprow) {
1339       z += 4; y += 4;
1340     }
1341   }
1342   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1343   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1344   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1349 {
1350   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1351   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1352   const PetscScalar *x,*xb;
1353   PetscScalar       *yarray,*zarray;
1354   const MatScalar   *v;
1355   PetscErrorCode    ierr;
1356   PetscInt          mbs = a->mbs,i,j,n;
1357   const PetscInt    *idx,*ii,*ridx = NULL;
1358   PetscBool         usecprow=a->compressedrow.use;
1359 
1360   PetscFunctionBegin;
1361   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1362   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1363 
1364   idx = a->j;
1365   v   = a->a;
1366   if (usecprow) {
1367     if (zz != yy) {
1368       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1369     }
1370     mbs  = a->compressedrow.nrows;
1371     ii   = a->compressedrow.i;
1372     ridx = a->compressedrow.rindex;
1373   } else {
1374     ii = a->i;
1375     y  = yarray;
1376     z  = zarray;
1377   }
1378 
1379   for (i=0; i<mbs; i++) {
1380     n = ii[1] - ii[0]; ii++;
1381     if (usecprow) {
1382       z = zarray + 5*ridx[i];
1383       y = yarray + 5*ridx[i];
1384     }
1385     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1386     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1387     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1388     for (j=0; j<n; j++) {
1389       xb    = x + 5*(*idx++);
1390       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1391       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1392       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1393       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1394       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1395       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1396       v    += 25;
1397     }
1398     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1399     if (!usecprow) {
1400       z += 5; y += 5;
1401     }
1402   }
1403   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1404   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1405   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1409 {
1410   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1411   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1412   const PetscScalar *x,*xb;
1413   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1414   const MatScalar   *v;
1415   PetscErrorCode    ierr;
1416   PetscInt          mbs = a->mbs,i,j,n;
1417   const PetscInt    *idx,*ii,*ridx=NULL;
1418   PetscBool         usecprow=a->compressedrow.use;
1419 
1420   PetscFunctionBegin;
1421   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1422   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1423 
1424   idx = a->j;
1425   v   = a->a;
1426   if (usecprow) {
1427     if (zz != yy) {
1428       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1429     }
1430     mbs  = a->compressedrow.nrows;
1431     ii   = a->compressedrow.i;
1432     ridx = a->compressedrow.rindex;
1433   } else {
1434     ii = a->i;
1435     y  = yarray;
1436     z  = zarray;
1437   }
1438 
1439   for (i=0; i<mbs; i++) {
1440     n = ii[1] - ii[0]; ii++;
1441     if (usecprow) {
1442       z = zarray + 6*ridx[i];
1443       y = yarray + 6*ridx[i];
1444     }
1445     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1446     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1447     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1448     for (j=0; j<n; j++) {
1449       xb    = x + 6*(*idx++);
1450       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1451       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1452       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1453       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1454       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1455       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1456       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1457       v    += 36;
1458     }
1459     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1460     if (!usecprow) {
1461       z += 6; y += 6;
1462     }
1463   }
1464   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1465   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1466   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1467   PetscFunctionReturn(0);
1468 }
1469 
1470 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1471 {
1472   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1473   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1474   const PetscScalar *x,*xb;
1475   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1476   const MatScalar   *v;
1477   PetscErrorCode    ierr;
1478   PetscInt          mbs = a->mbs,i,j,n;
1479   const PetscInt    *idx,*ii,*ridx = NULL;
1480   PetscBool         usecprow=a->compressedrow.use;
1481 
1482   PetscFunctionBegin;
1483   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1484   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1485 
1486   idx = a->j;
1487   v   = a->a;
1488   if (usecprow) {
1489     if (zz != yy) {
1490       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1491     }
1492     mbs  = a->compressedrow.nrows;
1493     ii   = a->compressedrow.i;
1494     ridx = a->compressedrow.rindex;
1495   } else {
1496     ii = a->i;
1497     y  = yarray;
1498     z  = zarray;
1499   }
1500 
1501   for (i=0; i<mbs; i++) {
1502     n = ii[1] - ii[0]; ii++;
1503     if (usecprow) {
1504       z = zarray + 7*ridx[i];
1505       y = yarray + 7*ridx[i];
1506     }
1507     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1508     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1509     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1510     for (j=0; j<n; j++) {
1511       xb    = x + 7*(*idx++);
1512       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1513       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1514       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1515       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1516       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1517       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1518       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1519       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1520       v    += 49;
1521     }
1522     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1523     if (!usecprow) {
1524       z += 7; y += 7;
1525     }
1526   }
1527   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1528   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1529   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1534 {
1535   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1536   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1537   const PetscScalar *x,*xb;
1538   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1539   const MatScalar   *v;
1540   PetscErrorCode    ierr;
1541   PetscInt          mbs = a->mbs,i,j,n;
1542   const PetscInt    *idx,*ii,*ridx = NULL;
1543   PetscBool         usecprow=a->compressedrow.use;
1544 
1545   PetscFunctionBegin;
1546   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1547   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1548 
1549   idx = a->j;
1550   v   = a->a;
1551   if (usecprow) {
1552     if (zz != yy) {
1553       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1554     }
1555     mbs  = a->compressedrow.nrows;
1556     ii   = a->compressedrow.i;
1557     ridx = a->compressedrow.rindex;
1558   } else {
1559     ii = a->i;
1560     y  = yarray;
1561     z  = zarray;
1562   }
1563 
1564   for (i=0; i<mbs; i++) {
1565     n = ii[1] - ii[0]; ii++;
1566     if (usecprow) {
1567       z = zarray + 11*ridx[i];
1568       y = yarray + 11*ridx[i];
1569     }
1570     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1571     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1572     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1573     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1574     for (j=0; j<n; j++) {
1575       xb    = x + 11*(*idx++);
1576       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1577       sum1 += v[0]*x1 + v[11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1578       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1579       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1580       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1581       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1582       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1583       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1584       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1585       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1586       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1587       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1588       v    += 121;
1589     }
1590     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1591     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1592     if (!usecprow) {
1593       z += 11; y += 11;
1594     }
1595   }
1596   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1597   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1598   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1603 {
1604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1605   PetscScalar       *z = 0,*work,*workt,*zarray;
1606   const PetscScalar *x,*xb;
1607   const MatScalar   *v;
1608   PetscErrorCode    ierr;
1609   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1610   PetscInt          ncols,k;
1611   const PetscInt    *ridx = NULL,*idx,*ii;
1612   PetscBool         usecprow = a->compressedrow.use;
1613 
1614   PetscFunctionBegin;
1615   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1616   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1617   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1618 
1619   idx = a->j;
1620   v   = a->a;
1621   if (usecprow) {
1622     mbs  = a->compressedrow.nrows;
1623     ii   = a->compressedrow.i;
1624     ridx = a->compressedrow.rindex;
1625   } else {
1626     mbs = a->mbs;
1627     ii  = a->i;
1628     z   = zarray;
1629   }
1630 
1631   if (!a->mult_work) {
1632     k    = PetscMax(A->rmap->n,A->cmap->n);
1633     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1634   }
1635   work = a->mult_work;
1636   for (i=0; i<mbs; i++) {
1637     n     = ii[1] - ii[0]; ii++;
1638     ncols = n*bs;
1639     workt = work;
1640     for (j=0; j<n; j++) {
1641       xb = x + bs*(*idx++);
1642       for (k=0; k<bs; k++) workt[k] = xb[k];
1643       workt += bs;
1644     }
1645     if (usecprow) z = zarray + bs*ridx[i];
1646     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1647     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1648     v += n*bs2;
1649     if (!usecprow) z += bs;
1650   }
1651   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1652   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1653   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1654   PetscFunctionReturn(0);
1655 }
1656 
1657 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1658 {
1659   PetscScalar    zero = 0.0;
1660   PetscErrorCode ierr;
1661 
1662   PetscFunctionBegin;
1663   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1664   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1665   PetscFunctionReturn(0);
1666 }
1667 
1668 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1669 {
1670   PetscScalar    zero = 0.0;
1671   PetscErrorCode ierr;
1672 
1673   PetscFunctionBegin;
1674   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1675   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1676   PetscFunctionReturn(0);
1677 }
1678 
1679 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1680 {
1681   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1682   PetscScalar       *z,x1,x2,x3,x4,x5;
1683   const PetscScalar *x,*xb = NULL;
1684   const MatScalar   *v;
1685   PetscErrorCode    ierr;
1686   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1687   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1688   Mat_CompressedRow cprow = a->compressedrow;
1689   PetscBool         usecprow = cprow.use;
1690 
1691   PetscFunctionBegin;
1692   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1693   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1694   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1695 
1696   idx = a->j;
1697   v   = a->a;
1698   if (usecprow) {
1699     mbs  = cprow.nrows;
1700     ii   = cprow.i;
1701     ridx = cprow.rindex;
1702   } else {
1703     mbs=a->mbs;
1704     ii = a->i;
1705     xb = x;
1706   }
1707 
1708   switch (bs) {
1709   case 1:
1710     for (i=0; i<mbs; i++) {
1711       if (usecprow) xb = x + ridx[i];
1712       x1 = xb[0];
1713       ib = idx + ii[0];
1714       n  = ii[1] - ii[0]; ii++;
1715       for (j=0; j<n; j++) {
1716         rval     = ib[j];
1717         z[rval] += PetscConj(*v) * x1;
1718         v++;
1719       }
1720       if (!usecprow) xb++;
1721     }
1722     break;
1723   case 2:
1724     for (i=0; i<mbs; i++) {
1725       if (usecprow) xb = x + 2*ridx[i];
1726       x1 = xb[0]; x2 = xb[1];
1727       ib = idx + ii[0];
1728       n  = ii[1] - ii[0]; ii++;
1729       for (j=0; j<n; j++) {
1730         rval       = ib[j]*2;
1731         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1732         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1733         v         += 4;
1734       }
1735       if (!usecprow) xb += 2;
1736     }
1737     break;
1738   case 3:
1739     for (i=0; i<mbs; i++) {
1740       if (usecprow) xb = x + 3*ridx[i];
1741       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1742       ib = idx + ii[0];
1743       n  = ii[1] - ii[0]; ii++;
1744       for (j=0; j<n; j++) {
1745         rval       = ib[j]*3;
1746         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1747         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1748         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1749         v         += 9;
1750       }
1751       if (!usecprow) xb += 3;
1752     }
1753     break;
1754   case 4:
1755     for (i=0; i<mbs; i++) {
1756       if (usecprow) xb = x + 4*ridx[i];
1757       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1758       ib = idx + ii[0];
1759       n  = ii[1] - ii[0]; ii++;
1760       for (j=0; j<n; j++) {
1761         rval       = ib[j]*4;
1762         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1763         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1764         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1765         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1766         v         += 16;
1767       }
1768       if (!usecprow) xb += 4;
1769     }
1770     break;
1771   case 5:
1772     for (i=0; i<mbs; i++) {
1773       if (usecprow) xb = x + 5*ridx[i];
1774       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1775       x4 = xb[3]; x5 = xb[4];
1776       ib = idx + ii[0];
1777       n  = ii[1] - ii[0]; ii++;
1778       for (j=0; j<n; j++) {
1779         rval       = ib[j]*5;
1780         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1781         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1782         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1783         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1784         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1785         v         += 25;
1786       }
1787       if (!usecprow) xb += 5;
1788     }
1789     break;
1790   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1791     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1792 #if 0
1793     {
1794       PetscInt          ncols,k,bs2=a->bs2;
1795       PetscScalar       *work,*workt,zb;
1796       const PetscScalar *xtmp;
1797       if (!a->mult_work) {
1798         k    = PetscMax(A->rmap->n,A->cmap->n);
1799         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1800       }
1801       work = a->mult_work;
1802       xtmp = x;
1803       for (i=0; i<mbs; i++) {
1804         n     = ii[1] - ii[0]; ii++;
1805         ncols = n*bs;
1806         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1807         if (usecprow) xtmp = x + bs*ridx[i];
1808         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1809         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1810         v += n*bs2;
1811         if (!usecprow) xtmp += bs;
1812         workt = work;
1813         for (j=0; j<n; j++) {
1814           zb = z + bs*(*idx++);
1815           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1816           workt += bs;
1817         }
1818       }
1819     }
1820 #endif
1821   }
1822   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1823   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1824   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1825   PetscFunctionReturn(0);
1826 }
1827 
1828 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1829 {
1830   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1831   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1832   const PetscScalar *x,*xb = 0;
1833   const MatScalar   *v;
1834   PetscErrorCode    ierr;
1835   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1836   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1837   Mat_CompressedRow cprow   = a->compressedrow;
1838   PetscBool         usecprow=cprow.use;
1839 
1840   PetscFunctionBegin;
1841   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1842   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1843   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1844 
1845   idx = a->j;
1846   v   = a->a;
1847   if (usecprow) {
1848     mbs  = cprow.nrows;
1849     ii   = cprow.i;
1850     ridx = cprow.rindex;
1851   } else {
1852     mbs=a->mbs;
1853     ii = a->i;
1854     xb = x;
1855   }
1856 
1857   switch (bs) {
1858   case 1:
1859     for (i=0; i<mbs; i++) {
1860       if (usecprow) xb = x + ridx[i];
1861       x1 = xb[0];
1862       ib = idx + ii[0];
1863       n  = ii[1] - ii[0]; ii++;
1864       for (j=0; j<n; j++) {
1865         rval     = ib[j];
1866         z[rval] += *v * x1;
1867         v++;
1868       }
1869       if (!usecprow) xb++;
1870     }
1871     break;
1872   case 2:
1873     for (i=0; i<mbs; i++) {
1874       if (usecprow) xb = x + 2*ridx[i];
1875       x1 = xb[0]; x2 = xb[1];
1876       ib = idx + ii[0];
1877       n  = ii[1] - ii[0]; ii++;
1878       for (j=0; j<n; j++) {
1879         rval       = ib[j]*2;
1880         z[rval++] += v[0]*x1 + v[1]*x2;
1881         z[rval++] += v[2]*x1 + v[3]*x2;
1882         v         += 4;
1883       }
1884       if (!usecprow) xb += 2;
1885     }
1886     break;
1887   case 3:
1888     for (i=0; i<mbs; i++) {
1889       if (usecprow) xb = x + 3*ridx[i];
1890       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1891       ib = idx + ii[0];
1892       n  = ii[1] - ii[0]; ii++;
1893       for (j=0; j<n; j++) {
1894         rval       = ib[j]*3;
1895         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1896         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1897         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1898         v         += 9;
1899       }
1900       if (!usecprow) xb += 3;
1901     }
1902     break;
1903   case 4:
1904     for (i=0; i<mbs; i++) {
1905       if (usecprow) xb = x + 4*ridx[i];
1906       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1907       ib = idx + ii[0];
1908       n  = ii[1] - ii[0]; ii++;
1909       for (j=0; j<n; j++) {
1910         rval       = ib[j]*4;
1911         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1912         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1913         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1914         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1915         v         += 16;
1916       }
1917       if (!usecprow) xb += 4;
1918     }
1919     break;
1920   case 5:
1921     for (i=0; i<mbs; i++) {
1922       if (usecprow) xb = x + 5*ridx[i];
1923       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1924       x4 = xb[3]; x5 = xb[4];
1925       ib = idx + ii[0];
1926       n  = ii[1] - ii[0]; ii++;
1927       for (j=0; j<n; j++) {
1928         rval       = ib[j]*5;
1929         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1930         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1931         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1932         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1933         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1934         v         += 25;
1935       }
1936       if (!usecprow) xb += 5;
1937     }
1938     break;
1939   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1940     PetscInt          ncols,k;
1941     PetscScalar       *work,*workt;
1942     const PetscScalar *xtmp;
1943     if (!a->mult_work) {
1944       k    = PetscMax(A->rmap->n,A->cmap->n);
1945       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1946     }
1947     work = a->mult_work;
1948     xtmp = x;
1949     for (i=0; i<mbs; i++) {
1950       n     = ii[1] - ii[0]; ii++;
1951       ncols = n*bs;
1952       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1953       if (usecprow) xtmp = x + bs*ridx[i];
1954       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1955       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1956       v += n*bs2;
1957       if (!usecprow) xtmp += bs;
1958       workt = work;
1959       for (j=0; j<n; j++) {
1960         zb = z + bs*(*idx++);
1961         for (k=0; k<bs; k++) zb[k] += workt[k];
1962         workt += bs;
1963       }
1964     }
1965     }
1966   }
1967   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1968   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1969   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 }
1972 
1973 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1974 {
1975   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1976   PetscInt       totalnz = a->bs2*a->nz;
1977   PetscScalar    oalpha  = alpha;
1978   PetscErrorCode ierr;
1979   PetscBLASInt   one = 1,tnz;
1980 
1981   PetscFunctionBegin;
1982   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1983   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1984   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1985   PetscFunctionReturn(0);
1986 }
1987 
1988 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1989 {
1990   PetscErrorCode ierr;
1991   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1992   MatScalar      *v  = a->a;
1993   PetscReal      sum = 0.0;
1994   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1995 
1996   PetscFunctionBegin;
1997   if (type == NORM_FROBENIUS) {
1998 #if defined(PETSC_USE_REAL___FP16)
1999     PetscBLASInt one = 1,cnt = bs2*nz;
2000     *norm = BLASnrm2_(&cnt,v,&one);
2001 #else
2002     for (i=0; i<bs2*nz; i++) {
2003       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2004     }
2005 #endif
2006     *norm = PetscSqrtReal(sum);
2007     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
2008   } else if (type == NORM_1) { /* maximum column sum */
2009     PetscReal *tmp;
2010     PetscInt  *bcol = a->j;
2011     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2012     for (i=0; i<nz; i++) {
2013       for (j=0; j<bs; j++) {
2014         k1 = bs*(*bcol) + j; /* column index */
2015         for (k=0; k<bs; k++) {
2016           tmp[k1] += PetscAbsScalar(*v); v++;
2017         }
2018       }
2019       bcol++;
2020     }
2021     *norm = 0.0;
2022     for (j=0; j<A->cmap->n; j++) {
2023       if (tmp[j] > *norm) *norm = tmp[j];
2024     }
2025     ierr = PetscFree(tmp);CHKERRQ(ierr);
2026     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2027   } else if (type == NORM_INFINITY) { /* maximum row sum */
2028     *norm = 0.0;
2029     for (k=0; k<bs; k++) {
2030       for (j=0; j<a->mbs; j++) {
2031         v   = a->a + bs2*a->i[j] + k;
2032         sum = 0.0;
2033         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2034           for (k1=0; k1<bs; k1++) {
2035             sum += PetscAbsScalar(*v);
2036             v   += bs;
2037           }
2038         }
2039         if (sum > *norm) *norm = sum;
2040       }
2041     }
2042     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2043   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 
2048 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2049 {
2050   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2051   PetscErrorCode ierr;
2052 
2053   PetscFunctionBegin;
2054   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2055   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2056     *flg = PETSC_FALSE;
2057     PetscFunctionReturn(0);
2058   }
2059 
2060   /* if the a->i are the same */
2061   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2062   if (!*flg) PetscFunctionReturn(0);
2063 
2064   /* if a->j are the same */
2065   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2066   if (!*flg) PetscFunctionReturn(0);
2067 
2068   /* if a->a are the same */
2069   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2070   PetscFunctionReturn(0);
2071 
2072 }
2073 
2074 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2075 {
2076   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2077   PetscErrorCode ierr;
2078   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2079   PetscScalar    *x,zero = 0.0;
2080   MatScalar      *aa,*aa_j;
2081 
2082   PetscFunctionBegin;
2083   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2084   bs   = A->rmap->bs;
2085   aa   = a->a;
2086   ai   = a->i;
2087   aj   = a->j;
2088   ambs = a->mbs;
2089   bs2  = a->bs2;
2090 
2091   ierr = VecSet(v,zero);CHKERRQ(ierr);
2092   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2093   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2094   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2095   for (i=0; i<ambs; i++) {
2096     for (j=ai[i]; j<ai[i+1]; j++) {
2097       if (aj[j] == i) {
2098         row  = i*bs;
2099         aa_j = aa+j*bs2;
2100         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2101         break;
2102       }
2103     }
2104   }
2105   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2106   PetscFunctionReturn(0);
2107 }
2108 
2109 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2110 {
2111   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2112   const PetscScalar *l,*r,*li,*ri;
2113   PetscScalar       x;
2114   MatScalar         *aa, *v;
2115   PetscErrorCode    ierr;
2116   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2117   const PetscInt    *ai,*aj;
2118 
2119   PetscFunctionBegin;
2120   ai  = a->i;
2121   aj  = a->j;
2122   aa  = a->a;
2123   m   = A->rmap->n;
2124   n   = A->cmap->n;
2125   bs  = A->rmap->bs;
2126   mbs = a->mbs;
2127   bs2 = a->bs2;
2128   if (ll) {
2129     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2130     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2131     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2132     for (i=0; i<mbs; i++) { /* for each block row */
2133       M  = ai[i+1] - ai[i];
2134       li = l + i*bs;
2135       v  = aa + bs2*ai[i];
2136       for (j=0; j<M; j++) { /* for each block */
2137         for (k=0; k<bs2; k++) {
2138           (*v++) *= li[k%bs];
2139         }
2140       }
2141     }
2142     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2143     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2144   }
2145 
2146   if (rr) {
2147     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2148     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2149     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2150     for (i=0; i<mbs; i++) { /* for each block row */
2151       iai = ai[i];
2152       M   = ai[i+1] - iai;
2153       v   = aa + bs2*iai;
2154       for (j=0; j<M; j++) { /* for each block */
2155         ri = r + bs*aj[iai+j];
2156         for (k=0; k<bs; k++) {
2157           x = ri[k];
2158           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2159           v += bs;
2160         }
2161       }
2162     }
2163     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2164     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2165   }
2166   PetscFunctionReturn(0);
2167 }
2168 
2169 
2170 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2171 {
2172   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2173 
2174   PetscFunctionBegin;
2175   info->block_size   = a->bs2;
2176   info->nz_allocated = a->bs2*a->maxnz;
2177   info->nz_used      = a->bs2*a->nz;
2178   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2179   info->assemblies   = A->num_ass;
2180   info->mallocs      = A->info.mallocs;
2181   info->memory       = ((PetscObject)A)->mem;
2182   if (A->factortype) {
2183     info->fill_ratio_given  = A->info.fill_ratio_given;
2184     info->fill_ratio_needed = A->info.fill_ratio_needed;
2185     info->factor_mallocs    = A->info.factor_mallocs;
2186   } else {
2187     info->fill_ratio_given  = 0;
2188     info->fill_ratio_needed = 0;
2189     info->factor_mallocs    = 0;
2190   }
2191   PetscFunctionReturn(0);
2192 }
2193 
2194 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2195 {
2196   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2197   PetscErrorCode ierr;
2198 
2199   PetscFunctionBegin;
2200   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2201   PetscFunctionReturn(0);
2202 }
2203