xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 9f4d3c52fa2fe0bb72fec4f4e85d8e495867af35)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8 {
9   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
10   PetscErrorCode ierr;
11   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
12   const PetscInt *idx;
13   PetscInt       start,end,*ai,*aj,bs,*nidx2;
14   PetscBT        table;
15 
16   PetscFunctionBegin;
17   m  = a->mbs;
18   ai = a->i;
19   aj = a->j;
20   bs = A->rmap->bs;
21 
22   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23 
24   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
25   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) {
29     /* Initialise the two local arrays */
30     isz  = 0;
31     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38     for (j=0; j<n; ++j) {
39       ival = idx[j]/bs; /* convert the indices into block indices */
40       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42     }
43     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
44     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
45 
46     k = 0;
47     for (j=0; j<ov; j++) { /* for each overlap*/
48       n = isz;
49       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
50         row   = nidx[k];
51         start = ai[row];
52         end   = ai[row+1];
53         for (l = start; l<end; l++) {
54           val = aj[l];
55           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56         }
57       }
58     }
59     /* expand the Index Set */
60     for (j=0; j<isz; j++) {
61       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62     }
63     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
64   }
65   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
66   ierr = PetscFree(nidx);CHKERRQ(ierr);
67   ierr = PetscFree(nidx2);CHKERRQ(ierr);
68   PetscFunctionReturn(0);
69 }
70 
71 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72 {
73   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
74   PetscErrorCode ierr;
75   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
77   const PetscInt *irow,*icol;
78   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79   PetscInt       *aj = a->j,*ai = a->i;
80   MatScalar      *mat_a;
81   Mat            C;
82   PetscBool      flag;
83 
84   PetscFunctionBegin;
85   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
86   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
87   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
88   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
89 
90   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
91   ssmap = smap;
92   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
93   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94   /* determine lens of each row */
95   for (i=0; i<nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k=kstart; k<kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ*)((*B)->data);
106 
107     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
109     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
111     C    = *B;
112   } else {
113     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
114     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
115     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
116     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
117   }
118   c = (Mat_SeqBAIJ*)(C->data);
119   for (i=0; i<nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j + mat_i;
125     mat_a    = c->a + mat_i*bs2;
126     mat_ilen = c->ilen + i;
127     for (k=kstart; k<kend; k++) {
128       if ((tcol=ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
131         mat_a   += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   {
138     MatScalar *work;
139     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
140     for (i=0; i<nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i*bs2;
145       ilen  = c->ilen[i];
146       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
147     }
148     ierr = PetscFree(work);CHKERRQ(ierr);
149   }
150 
151   /* Free work space */
152   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153   ierr = PetscFree(smap);CHKERRQ(ierr);
154   ierr = PetscFree(lens);CHKERRQ(ierr);
155   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157 
158   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159   *B   = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   maxmnbs = PetscMax(a->mbs,a->nbs);
180   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185   }
186   count = 0;
187   for (i=0; i<nrows; i++) {
188     j = irow[i] / bs;
189     if ((vary[j]--)==bs) iary[count++] = j;
190   }
191   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
192 
193   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
194   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195   for (i=0; i<a->nbs; i++) {
196     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197   }
198   count = 0;
199   for (i=0; i<ncols; i++) {
200     j = icol[i] / bs;
201     if ((vary[j]--)==bs) iary[count++] = j;
202   }
203   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(&is1);CHKERRQ(ierr);
210   ierr = ISDestroy(&is2);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
215 {
216   PetscErrorCode ierr;
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubSppt    *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   ierr = submatj->destroy(C);CHKERRQ(ierr);
222   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
227 {
228   PetscErrorCode ierr;
229   PetscInt       i;
230   Mat            C;
231   Mat_SeqBAIJ    *c;
232   Mat_SubSppt    *submatj;
233 
234   PetscFunctionBegin;
235   for (i=0; i<n; i++) {
236     C       = (*mat)[i];
237     c       = (Mat_SeqBAIJ*)C->data;
238     submatj = c->submatis1;
239     if (submatj) {
240       ierr = submatj->destroy(C);CHKERRQ(ierr);
241       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
242       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
243       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
244       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
245     } else {
246       ierr = MatDestroy(&C);CHKERRQ(ierr);
247     }
248   }
249   ierr = PetscFree(*mat);CHKERRQ(ierr);
250   PetscFunctionReturn(0);
251 }
252 
253 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
254 {
255   PetscErrorCode ierr;
256   PetscInt       i;
257 
258   PetscFunctionBegin;
259   if (scall == MAT_INITIAL_MATRIX) {
260     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
261   }
262 
263   for (i=0; i<n; i++) {
264     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
265   }
266   PetscFunctionReturn(0);
267 }
268 
269 /* -------------------------------------------------------*/
270 /* Should check that shapes of vectors and matrices match */
271 /* -------------------------------------------------------*/
272 
273 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
274 {
275   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
276   PetscScalar       *z,sum;
277   const PetscScalar *x;
278   const MatScalar   *v;
279   PetscErrorCode    ierr;
280   PetscInt          mbs,i,n;
281   const PetscInt    *idx,*ii,*ridx=NULL;
282   PetscBool         usecprow=a->compressedrow.use;
283 
284   PetscFunctionBegin;
285   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
286   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
287 
288   if (usecprow) {
289     mbs  = a->compressedrow.nrows;
290     ii   = a->compressedrow.i;
291     ridx = a->compressedrow.rindex;
292     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
293   } else {
294     mbs = a->mbs;
295     ii  = a->i;
296   }
297 
298   for (i=0; i<mbs; i++) {
299     n   = ii[1] - ii[0];
300     v   = a->a + ii[0];
301     idx = a->j + ii[0];
302     ii++;
303     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
304     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
305     sum = 0.0;
306     PetscSparseDensePlusDot(sum,x,v,idx,n);
307     if (usecprow) {
308       z[ridx[i]] = sum;
309     } else {
310       z[i]       = sum;
311     }
312   }
313   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
314   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
315   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 
319 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
320 {
321   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
322   PetscScalar       *z = 0,sum1,sum2,*zarray;
323   const PetscScalar *x,*xb;
324   PetscScalar       x1,x2;
325   const MatScalar   *v;
326   PetscErrorCode    ierr;
327   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
328   PetscBool         usecprow=a->compressedrow.use;
329 
330   PetscFunctionBegin;
331   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
332   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
333 
334   idx = a->j;
335   v   = a->a;
336   if (usecprow) {
337     mbs  = a->compressedrow.nrows;
338     ii   = a->compressedrow.i;
339     ridx = a->compressedrow.rindex;
340     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
341   } else {
342     mbs = a->mbs;
343     ii  = a->i;
344     z   = zarray;
345   }
346 
347   for (i=0; i<mbs; i++) {
348     n           = ii[1] - ii[0]; ii++;
349     sum1        = 0.0; sum2 = 0.0;
350     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
351     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
352     for (j=0; j<n; j++) {
353       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
354       sum1 += v[0]*x1 + v[2]*x2;
355       sum2 += v[1]*x1 + v[3]*x2;
356       v    += 4;
357     }
358     if (usecprow) z = zarray + 2*ridx[i];
359     z[0] = sum1; z[1] = sum2;
360     if (!usecprow) z += 2;
361   }
362   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
363   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
364   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 
368 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
369 {
370   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
371   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
372   const PetscScalar *x,*xb;
373   const MatScalar   *v;
374   PetscErrorCode    ierr;
375   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
376   PetscBool         usecprow=a->compressedrow.use;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379 #pragma disjoint(*v,*z,*xb)
380 #endif
381 
382   PetscFunctionBegin;
383   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
384   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
385 
386   idx = a->j;
387   v   = a->a;
388   if (usecprow) {
389     mbs  = a->compressedrow.nrows;
390     ii   = a->compressedrow.i;
391     ridx = a->compressedrow.rindex;
392     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
393   } else {
394     mbs = a->mbs;
395     ii  = a->i;
396     z   = zarray;
397   }
398 
399   for (i=0; i<mbs; i++) {
400     n           = ii[1] - ii[0]; ii++;
401     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
402     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
403     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404     for (j=0; j<n; j++) {
405       xb = x + 3*(*idx++);
406       x1 = xb[0];
407       x2 = xb[1];
408       x3 = xb[2];
409 
410       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
411       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
412       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
413       v    += 9;
414     }
415     if (usecprow) z = zarray + 3*ridx[i];
416     z[0] = sum1; z[1] = sum2; z[2] = sum3;
417     if (!usecprow) z += 3;
418   }
419   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
420   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
421   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
422   PetscFunctionReturn(0);
423 }
424 
425 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
426 {
427   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
428   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
429   const PetscScalar *x,*xb;
430   const MatScalar   *v;
431   PetscErrorCode    ierr;
432   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
433   PetscBool         usecprow=a->compressedrow.use;
434 
435   PetscFunctionBegin;
436   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
437   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
438 
439   idx = a->j;
440   v   = a->a;
441   if (usecprow) {
442     mbs  = a->compressedrow.nrows;
443     ii   = a->compressedrow.i;
444     ridx = a->compressedrow.rindex;
445     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
446   } else {
447     mbs = a->mbs;
448     ii  = a->i;
449     z   = zarray;
450   }
451 
452   for (i=0; i<mbs; i++) {
453     n = ii[1] - ii[0];
454     ii++;
455     sum1 = 0.0;
456     sum2 = 0.0;
457     sum3 = 0.0;
458     sum4 = 0.0;
459 
460     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
461     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
462     for (j=0; j<n; j++) {
463       xb    = x + 4*(*idx++);
464       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
465       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
466       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
467       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
468       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
469       v    += 16;
470     }
471     if (usecprow) z = zarray + 4*ridx[i];
472     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
473     if (!usecprow) z += 4;
474   }
475   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
477   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
481 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
482 {
483   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
484   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
485   const PetscScalar *xb,*x;
486   const MatScalar   *v;
487   PetscErrorCode    ierr;
488   const PetscInt    *idx,*ii,*ridx=NULL;
489   PetscInt          mbs,i,j,n;
490   PetscBool         usecprow=a->compressedrow.use;
491 
492   PetscFunctionBegin;
493   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
495 
496   idx = a->j;
497   v   = a->a;
498   if (usecprow) {
499     mbs  = a->compressedrow.nrows;
500     ii   = a->compressedrow.i;
501     ridx = a->compressedrow.rindex;
502     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
503   } else {
504     mbs = a->mbs;
505     ii  = a->i;
506     z   = zarray;
507   }
508 
509   for (i=0; i<mbs; i++) {
510     n           = ii[1] - ii[0]; ii++;
511     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
512     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
513     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
514     for (j=0; j<n; j++) {
515       xb    = x + 5*(*idx++);
516       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
517       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
518       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
519       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
520       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
521       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
522       v    += 25;
523     }
524     if (usecprow) z = zarray + 5*ridx[i];
525     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
526     if (!usecprow) z += 5;
527   }
528   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
530   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
531   PetscFunctionReturn(0);
532 }
533 
534 
535 
536 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
537 {
538   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
539   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
540   const PetscScalar *x,*xb;
541   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
542   const MatScalar   *v;
543   PetscErrorCode    ierr;
544   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
545   PetscBool         usecprow=a->compressedrow.use;
546 
547   PetscFunctionBegin;
548   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
549   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
550 
551   idx = a->j;
552   v   = a->a;
553   if (usecprow) {
554     mbs  = a->compressedrow.nrows;
555     ii   = a->compressedrow.i;
556     ridx = a->compressedrow.rindex;
557     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
558   } else {
559     mbs = a->mbs;
560     ii  = a->i;
561     z   = zarray;
562   }
563 
564   for (i=0; i<mbs; i++) {
565     n  = ii[1] - ii[0];
566     ii++;
567     sum1 = 0.0;
568     sum2 = 0.0;
569     sum3 = 0.0;
570     sum4 = 0.0;
571     sum5 = 0.0;
572     sum6 = 0.0;
573 
574     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
575     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
576     for (j=0; j<n; j++) {
577       xb    = x + 6*(*idx++);
578       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
579       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
580       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
581       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
582       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
583       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
584       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
585       v    += 36;
586     }
587     if (usecprow) z = zarray + 6*ridx[i];
588     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
589     if (!usecprow) z += 6;
590   }
591 
592   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
593   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
594   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
599 {
600   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
601   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
602   const PetscScalar *x,*xb;
603   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
604   const MatScalar   *v;
605   PetscErrorCode    ierr;
606   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
607   PetscBool         usecprow=a->compressedrow.use;
608 
609   PetscFunctionBegin;
610   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
611   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
612 
613   idx = a->j;
614   v   = a->a;
615   if (usecprow) {
616     mbs  = a->compressedrow.nrows;
617     ii   = a->compressedrow.i;
618     ridx = a->compressedrow.rindex;
619     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
620   } else {
621     mbs = a->mbs;
622     ii  = a->i;
623     z   = zarray;
624   }
625 
626   for (i=0; i<mbs; i++) {
627     n  = ii[1] - ii[0];
628     ii++;
629     sum1 = 0.0;
630     sum2 = 0.0;
631     sum3 = 0.0;
632     sum4 = 0.0;
633     sum5 = 0.0;
634     sum6 = 0.0;
635     sum7 = 0.0;
636 
637     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
638     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
639     for (j=0; j<n; j++) {
640       xb    = x + 7*(*idx++);
641       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
642       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
643       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
644       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
645       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
646       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
647       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
648       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
649       v    += 49;
650     }
651     if (usecprow) z = zarray + 7*ridx[i];
652     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
653     if (!usecprow) z += 7;
654   }
655 
656   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
657   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
658   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661 
662 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
663 {
664   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
665   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
666   const PetscScalar *x,*xb;
667   PetscScalar       *zarray,xv;
668   const MatScalar   *v;
669   PetscErrorCode    ierr;
670   const PetscInt    *ii,*ij=a->j,*idx;
671   PetscInt          mbs,i,j,k,n,*ridx=NULL;
672   PetscBool         usecprow=a->compressedrow.use;
673 
674   PetscFunctionBegin;
675   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
676   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
677 
678   v = a->a;
679   if (usecprow) {
680     mbs  = a->compressedrow.nrows;
681     ii   = a->compressedrow.i;
682     ridx = a->compressedrow.rindex;
683     ierr = PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
684   } else {
685     mbs = a->mbs;
686     ii  = a->i;
687     z   = zarray;
688   }
689 
690   for (i=0; i<mbs; i++) {
691     n    = ii[i+1] - ii[i];
692     idx  = ij + ii[i];
693     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
694     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
695 
696     for (j=0; j<n; j++) {
697       xb = x + 11*(idx[j]);
698 
699       for (k=0; k<11; k++) {
700         xv     =  xb[k];
701         sum1  += v[0]*xv;
702         sum2  += v[1]*xv;
703         sum3  += v[2]*xv;
704         sum4  += v[3]*xv;
705         sum5  += v[4]*xv;
706         sum6  += v[5]*xv;
707         sum7  += v[6]*xv;
708         sum8  += v[7]*xv;
709         sum9  += v[8]*xv;
710         sum10 += v[9]*xv;
711         sum11 += v[10]*xv;
712         v     += 11;
713       }
714     }
715     if (usecprow) z = zarray + 11*ridx[i];
716     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
717     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
718 
719     if (!usecprow) z += 11;
720   }
721 
722   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
723   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
724   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 
728 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
729 /* Default MatMult for block size 15 */
730 
731 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
732 {
733   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
734   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
735   const PetscScalar *x,*xb;
736   PetscScalar       *zarray,xv;
737   const MatScalar   *v;
738   PetscErrorCode    ierr;
739   const PetscInt    *ii,*ij=a->j,*idx;
740   PetscInt          mbs,i,j,k,n,*ridx=NULL;
741   PetscBool         usecprow=a->compressedrow.use;
742 
743   PetscFunctionBegin;
744   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
745   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
746 
747   v = a->a;
748   if (usecprow) {
749     mbs  = a->compressedrow.nrows;
750     ii   = a->compressedrow.i;
751     ridx = a->compressedrow.rindex;
752     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
753   } else {
754     mbs = a->mbs;
755     ii  = a->i;
756     z   = zarray;
757   }
758 
759   for (i=0; i<mbs; i++) {
760     n    = ii[i+1] - ii[i];
761     idx  = ij + ii[i];
762     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
763     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
764 
765     for (j=0; j<n; j++) {
766       xb = x + 15*(idx[j]);
767 
768       for (k=0; k<15; k++) {
769         xv     =  xb[k];
770         sum1  += v[0]*xv;
771         sum2  += v[1]*xv;
772         sum3  += v[2]*xv;
773         sum4  += v[3]*xv;
774         sum5  += v[4]*xv;
775         sum6  += v[5]*xv;
776         sum7  += v[6]*xv;
777         sum8  += v[7]*xv;
778         sum9  += v[8]*xv;
779         sum10 += v[9]*xv;
780         sum11 += v[10]*xv;
781         sum12 += v[11]*xv;
782         sum13 += v[12]*xv;
783         sum14 += v[13]*xv;
784         sum15 += v[14]*xv;
785         v     += 15;
786       }
787     }
788     if (usecprow) z = zarray + 15*ridx[i];
789     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
791 
792     if (!usecprow) z += 15;
793   }
794 
795   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
796   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
797   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
798   PetscFunctionReturn(0);
799 }
800 
801 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
802 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
803 {
804   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
805   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
806   const PetscScalar *x,*xb;
807   PetscScalar       x1,x2,x3,x4,*zarray;
808   const MatScalar   *v;
809   PetscErrorCode    ierr;
810   const PetscInt    *ii,*ij=a->j,*idx;
811   PetscInt          mbs,i,j,n,*ridx=NULL;
812   PetscBool         usecprow=a->compressedrow.use;
813 
814   PetscFunctionBegin;
815   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
816   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
817 
818   v = a->a;
819   if (usecprow) {
820     mbs  = a->compressedrow.nrows;
821     ii   = a->compressedrow.i;
822     ridx = a->compressedrow.rindex;
823     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
824   } else {
825     mbs = a->mbs;
826     ii  = a->i;
827     z   = zarray;
828   }
829 
830   for (i=0; i<mbs; i++) {
831     n    = ii[i+1] - ii[i];
832     idx  = ij + ii[i];
833     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
834     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
835 
836     for (j=0; j<n; j++) {
837       xb = x + 15*(idx[j]);
838       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
839 
840       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
841       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
842       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
843       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
844       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
845       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
846       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
847       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
848       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
849       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
850       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
851       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
852       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
853       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
854       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
855 
856       v += 60;
857 
858       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
859 
860       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
861       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
862       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
863       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
864       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
865       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
866       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
867       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
868       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
869       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
870       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
871       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
872       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
873       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
874       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
875       v     += 60;
876 
877       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
878       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
879       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
880       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
881       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
882       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
883       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
884       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
885       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
886       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
887       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
888       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
889       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
890       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
891       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
892       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
893       v     += 60;
894 
895       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
896       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
897       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
898       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
899       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
900       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
901       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
902       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
903       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
904       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
905       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
906       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
907       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
908       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
909       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
910       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
911       v     += 45;
912     }
913     if (usecprow) z = zarray + 15*ridx[i];
914     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
916 
917     if (!usecprow) z += 15;
918   }
919 
920   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
921   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
922   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
923   PetscFunctionReturn(0);
924 }
925 
926 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
927 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
928 {
929   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931   const PetscScalar *x,*xb;
932   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
933   const MatScalar   *v;
934   PetscErrorCode    ierr;
935   const PetscInt    *ii,*ij=a->j,*idx;
936   PetscInt          mbs,i,j,n,*ridx=NULL;
937   PetscBool         usecprow=a->compressedrow.use;
938 
939   PetscFunctionBegin;
940   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
941   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
942 
943   v = a->a;
944   if (usecprow) {
945     mbs  = a->compressedrow.nrows;
946     ii   = a->compressedrow.i;
947     ridx = a->compressedrow.rindex;
948     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
949   } else {
950     mbs = a->mbs;
951     ii  = a->i;
952     z   = zarray;
953   }
954 
955   for (i=0; i<mbs; i++) {
956     n    = ii[i+1] - ii[i];
957     idx  = ij + ii[i];
958     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
960 
961     for (j=0; j<n; j++) {
962       xb = x + 15*(idx[j]);
963       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964       x8 = xb[7];
965 
966       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
967       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
968       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
969       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
970       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
971       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
972       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
973       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
974       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
975       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
976       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
977       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
978       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
979       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
980       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
981       v     += 120;
982 
983       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
984 
985       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
986       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
987       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
988       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
989       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
990       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
991       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
992       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
993       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
994       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
995       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
996       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
997       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
998       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
999       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1000       v     += 105;
1001     }
1002     if (usecprow) z = zarray + 15*ridx[i];
1003     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1004     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1005 
1006     if (!usecprow) z += 15;
1007   }
1008 
1009   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1011   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1016 
1017 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1018 {
1019   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1020   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1021   const PetscScalar *x,*xb;
1022   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1023   const MatScalar   *v;
1024   PetscErrorCode    ierr;
1025   const PetscInt    *ii,*ij=a->j,*idx;
1026   PetscInt          mbs,i,j,n,*ridx=NULL;
1027   PetscBool         usecprow=a->compressedrow.use;
1028 
1029   PetscFunctionBegin;
1030   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1031   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1032 
1033   v = a->a;
1034   if (usecprow) {
1035     mbs  = a->compressedrow.nrows;
1036     ii   = a->compressedrow.i;
1037     ridx = a->compressedrow.rindex;
1038     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1039   } else {
1040     mbs = a->mbs;
1041     ii  = a->i;
1042     z   = zarray;
1043   }
1044 
1045   for (i=0; i<mbs; i++) {
1046     n    = ii[i+1] - ii[i];
1047     idx  = ij + ii[i];
1048     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1049     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1050 
1051     for (j=0; j<n; j++) {
1052       xb = x + 15*(idx[j]);
1053       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1054       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1055 
1056       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1057       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1058       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1059       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1060       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1061       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1062       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1063       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1064       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1065       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1066       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1067       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1068       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1069       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1070       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1071       v     += 225;
1072     }
1073     if (usecprow) z = zarray + 15*ridx[i];
1074     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1075     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1076 
1077     if (!usecprow) z += 15;
1078   }
1079 
1080   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1081   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1082   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 
1087 /*
1088     This will not work with MatScalar == float because it calls the BLAS
1089 */
1090 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1091 {
1092   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1093   PetscScalar       *z = 0,*work,*workt,*zarray;
1094   const PetscScalar *x,*xb;
1095   const MatScalar   *v;
1096   PetscErrorCode    ierr;
1097   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1098   const PetscInt    *idx,*ii,*ridx=NULL;
1099   PetscInt          ncols,k;
1100   PetscBool         usecprow=a->compressedrow.use;
1101 
1102   PetscFunctionBegin;
1103   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1104   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1105 
1106   idx = a->j;
1107   v   = a->a;
1108   if (usecprow) {
1109     mbs  = a->compressedrow.nrows;
1110     ii   = a->compressedrow.i;
1111     ridx = a->compressedrow.rindex;
1112     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1113   } else {
1114     mbs = a->mbs;
1115     ii  = a->i;
1116     z   = zarray;
1117   }
1118 
1119   if (!a->mult_work) {
1120     k    = PetscMax(A->rmap->n,A->cmap->n);
1121     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1122   }
1123   work = a->mult_work;
1124   for (i=0; i<mbs; i++) {
1125     n           = ii[1] - ii[0]; ii++;
1126     ncols       = n*bs;
1127     workt       = work;
1128     for (j=0; j<n; j++) {
1129       xb = x + bs*(*idx++);
1130       for (k=0; k<bs; k++) workt[k] = xb[k];
1131       workt += bs;
1132     }
1133     if (usecprow) z = zarray + bs*ridx[i];
1134     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1135     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1136     v += n*bs2;
1137     if (!usecprow) z += bs;
1138   }
1139   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1140   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1141   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1146 {
1147   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1148   const PetscScalar *x;
1149   PetscScalar       *y,*z,sum;
1150   const MatScalar   *v;
1151   PetscErrorCode    ierr;
1152   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1153   const PetscInt    *idx,*ii;
1154   PetscBool         usecprow=a->compressedrow.use;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1158   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1159 
1160   idx = a->j;
1161   v   = a->a;
1162   if (usecprow) {
1163     if (zz != yy) {
1164       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1165     }
1166     mbs  = a->compressedrow.nrows;
1167     ii   = a->compressedrow.i;
1168     ridx = a->compressedrow.rindex;
1169   } else {
1170     ii = a->i;
1171   }
1172 
1173   for (i=0; i<mbs; i++) {
1174     n = ii[1] - ii[0];
1175     ii++;
1176     if (!usecprow) {
1177       sum         = y[i];
1178     } else {
1179       sum = y[ridx[i]];
1180     }
1181     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1182     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1183     PetscSparseDensePlusDot(sum,x,v,idx,n);
1184     v   += n;
1185     idx += n;
1186     if (usecprow) {
1187       z[ridx[i]] = sum;
1188     } else {
1189       z[i] = sum;
1190     }
1191   }
1192   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1193   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1194   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1199 {
1200   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1201   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1202   const PetscScalar *x,*xb;
1203   PetscScalar       x1,x2,*yarray,*zarray;
1204   const MatScalar   *v;
1205   PetscErrorCode    ierr;
1206   PetscInt          mbs = a->mbs,i,n,j;
1207   const PetscInt    *idx,*ii,*ridx = NULL;
1208   PetscBool         usecprow = a->compressedrow.use;
1209 
1210   PetscFunctionBegin;
1211   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1212   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1213 
1214   idx = a->j;
1215   v   = a->a;
1216   if (usecprow) {
1217     if (zz != yy) {
1218       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1219     }
1220     mbs  = a->compressedrow.nrows;
1221     ii   = a->compressedrow.i;
1222     ridx = a->compressedrow.rindex;
1223   } else {
1224     ii = a->i;
1225     y  = yarray;
1226     z  = zarray;
1227   }
1228 
1229   for (i=0; i<mbs; i++) {
1230     n = ii[1] - ii[0]; ii++;
1231     if (usecprow) {
1232       z = zarray + 2*ridx[i];
1233       y = yarray + 2*ridx[i];
1234     }
1235     sum1 = y[0]; sum2 = y[1];
1236     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1237     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1238     for (j=0; j<n; j++) {
1239       xb = x + 2*(*idx++);
1240       x1 = xb[0];
1241       x2 = xb[1];
1242 
1243       sum1 += v[0]*x1 + v[2]*x2;
1244       sum2 += v[1]*x1 + v[3]*x2;
1245       v    += 4;
1246     }
1247     z[0] = sum1; z[1] = sum2;
1248     if (!usecprow) {
1249       z += 2; y += 2;
1250     }
1251   }
1252   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1253   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1254   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1255   PetscFunctionReturn(0);
1256 }
1257 
1258 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1259 {
1260   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1261   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1262   const PetscScalar *x,*xb;
1263   const MatScalar   *v;
1264   PetscErrorCode    ierr;
1265   PetscInt          mbs = a->mbs,i,j,n;
1266   const PetscInt    *idx,*ii,*ridx = NULL;
1267   PetscBool         usecprow = a->compressedrow.use;
1268 
1269   PetscFunctionBegin;
1270   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1271   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1272 
1273   idx = a->j;
1274   v   = a->a;
1275   if (usecprow) {
1276     if (zz != yy) {
1277       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1278     }
1279     mbs  = a->compressedrow.nrows;
1280     ii   = a->compressedrow.i;
1281     ridx = a->compressedrow.rindex;
1282   } else {
1283     ii = a->i;
1284     y  = yarray;
1285     z  = zarray;
1286   }
1287 
1288   for (i=0; i<mbs; i++) {
1289     n = ii[1] - ii[0]; ii++;
1290     if (usecprow) {
1291       z = zarray + 3*ridx[i];
1292       y = yarray + 3*ridx[i];
1293     }
1294     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1295     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1296     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297     for (j=0; j<n; j++) {
1298       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1299       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1300       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1301       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1302       v    += 9;
1303     }
1304     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1305     if (!usecprow) {
1306       z += 3; y += 3;
1307     }
1308   }
1309   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1310   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1311   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1316 {
1317   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1318   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1319   const PetscScalar *x,*xb;
1320   const MatScalar   *v;
1321   PetscErrorCode    ierr;
1322   PetscInt          mbs = a->mbs,i,j,n;
1323   const PetscInt    *idx,*ii,*ridx=NULL;
1324   PetscBool         usecprow=a->compressedrow.use;
1325 
1326   PetscFunctionBegin;
1327   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1328   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1329 
1330   idx = a->j;
1331   v   = a->a;
1332   if (usecprow) {
1333     if (zz != yy) {
1334       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1335     }
1336     mbs  = a->compressedrow.nrows;
1337     ii   = a->compressedrow.i;
1338     ridx = a->compressedrow.rindex;
1339   } else {
1340     ii = a->i;
1341     y  = yarray;
1342     z  = zarray;
1343   }
1344 
1345   for (i=0; i<mbs; i++) {
1346     n = ii[1] - ii[0]; ii++;
1347     if (usecprow) {
1348       z = zarray + 4*ridx[i];
1349       y = yarray + 4*ridx[i];
1350     }
1351     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1352     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1353     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1354     for (j=0; j<n; j++) {
1355       xb    = x + 4*(*idx++);
1356       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1357       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1358       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1359       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1360       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1361       v    += 16;
1362     }
1363     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1364     if (!usecprow) {
1365       z += 4; y += 4;
1366     }
1367   }
1368   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1369   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1370   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1371   PetscFunctionReturn(0);
1372 }
1373 
1374 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1375 {
1376   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1377   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1378   const PetscScalar *x,*xb;
1379   PetscScalar       *yarray,*zarray;
1380   const MatScalar   *v;
1381   PetscErrorCode    ierr;
1382   PetscInt          mbs = a->mbs,i,j,n;
1383   const PetscInt    *idx,*ii,*ridx = NULL;
1384   PetscBool         usecprow=a->compressedrow.use;
1385 
1386   PetscFunctionBegin;
1387   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1388   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1389 
1390   idx = a->j;
1391   v   = a->a;
1392   if (usecprow) {
1393     if (zz != yy) {
1394       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1395     }
1396     mbs  = a->compressedrow.nrows;
1397     ii   = a->compressedrow.i;
1398     ridx = a->compressedrow.rindex;
1399   } else {
1400     ii = a->i;
1401     y  = yarray;
1402     z  = zarray;
1403   }
1404 
1405   for (i=0; i<mbs; i++) {
1406     n = ii[1] - ii[0]; ii++;
1407     if (usecprow) {
1408       z = zarray + 5*ridx[i];
1409       y = yarray + 5*ridx[i];
1410     }
1411     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1412     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1413     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1414     for (j=0; j<n; j++) {
1415       xb    = x + 5*(*idx++);
1416       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1417       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1418       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1419       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1420       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1421       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1422       v    += 25;
1423     }
1424     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1425     if (!usecprow) {
1426       z += 5; y += 5;
1427     }
1428   }
1429   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1430   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1431   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1432   PetscFunctionReturn(0);
1433 }
1434 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1435 {
1436   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1437   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1438   const PetscScalar *x,*xb;
1439   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1440   const MatScalar   *v;
1441   PetscErrorCode    ierr;
1442   PetscInt          mbs = a->mbs,i,j,n;
1443   const PetscInt    *idx,*ii,*ridx=NULL;
1444   PetscBool         usecprow=a->compressedrow.use;
1445 
1446   PetscFunctionBegin;
1447   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1448   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1449 
1450   idx = a->j;
1451   v   = a->a;
1452   if (usecprow) {
1453     if (zz != yy) {
1454       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1455     }
1456     mbs  = a->compressedrow.nrows;
1457     ii   = a->compressedrow.i;
1458     ridx = a->compressedrow.rindex;
1459   } else {
1460     ii = a->i;
1461     y  = yarray;
1462     z  = zarray;
1463   }
1464 
1465   for (i=0; i<mbs; i++) {
1466     n = ii[1] - ii[0]; ii++;
1467     if (usecprow) {
1468       z = zarray + 6*ridx[i];
1469       y = yarray + 6*ridx[i];
1470     }
1471     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1472     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1473     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1474     for (j=0; j<n; j++) {
1475       xb    = x + 6*(*idx++);
1476       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1477       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1478       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1479       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1480       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1481       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1482       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1483       v    += 36;
1484     }
1485     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1486     if (!usecprow) {
1487       z += 6; y += 6;
1488     }
1489   }
1490   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1491   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1492   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1493   PetscFunctionReturn(0);
1494 }
1495 
1496 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1497 {
1498   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1499   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1500   const PetscScalar *x,*xb;
1501   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1502   const MatScalar   *v;
1503   PetscErrorCode    ierr;
1504   PetscInt          mbs = a->mbs,i,j,n;
1505   const PetscInt    *idx,*ii,*ridx = NULL;
1506   PetscBool         usecprow=a->compressedrow.use;
1507 
1508   PetscFunctionBegin;
1509   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1510   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1511 
1512   idx = a->j;
1513   v   = a->a;
1514   if (usecprow) {
1515     if (zz != yy) {
1516       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1517     }
1518     mbs  = a->compressedrow.nrows;
1519     ii   = a->compressedrow.i;
1520     ridx = a->compressedrow.rindex;
1521   } else {
1522     ii = a->i;
1523     y  = yarray;
1524     z  = zarray;
1525   }
1526 
1527   for (i=0; i<mbs; i++) {
1528     n = ii[1] - ii[0]; ii++;
1529     if (usecprow) {
1530       z = zarray + 7*ridx[i];
1531       y = yarray + 7*ridx[i];
1532     }
1533     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1534     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1535     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536     for (j=0; j<n; j++) {
1537       xb    = x + 7*(*idx++);
1538       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1539       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1540       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1541       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1542       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1543       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1544       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1545       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1546       v    += 49;
1547     }
1548     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1549     if (!usecprow) {
1550       z += 7; y += 7;
1551     }
1552   }
1553   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1554   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1555   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1560 {
1561   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1562   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1563   const PetscScalar *x,*xb;
1564   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1565   const MatScalar   *v;
1566   PetscErrorCode    ierr;
1567   PetscInt          mbs = a->mbs,i,j,n;
1568   const PetscInt    *idx,*ii,*ridx = NULL;
1569   PetscBool         usecprow=a->compressedrow.use;
1570 
1571   PetscFunctionBegin;
1572   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1573   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1574 
1575   idx = a->j;
1576   v   = a->a;
1577   if (usecprow) {
1578     if (zz != yy) {
1579       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1580     }
1581     mbs  = a->compressedrow.nrows;
1582     ii   = a->compressedrow.i;
1583     ridx = a->compressedrow.rindex;
1584   } else {
1585     ii = a->i;
1586     y  = yarray;
1587     z  = zarray;
1588   }
1589 
1590   for (i=0; i<mbs; i++) {
1591     n = ii[1] - ii[0]; ii++;
1592     if (usecprow) {
1593       z = zarray + 11*ridx[i];
1594       y = yarray + 11*ridx[i];
1595     }
1596     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1597     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1598     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1599     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600     for (j=0; j<n; j++) {
1601       xb    = x + 11*(*idx++);
1602       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1603       sum1 += v[0]*x1 + v[11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1604       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1605       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1606       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1607       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1608       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1609       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1610       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1611       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1612       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1613       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1614       v    += 121;
1615     }
1616     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1617     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1618     if (!usecprow) {
1619       z += 11; y += 11;
1620     }
1621   }
1622   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1623   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1624   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1625   PetscFunctionReturn(0);
1626 }
1627 
1628 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1629 {
1630   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1631   PetscScalar       *z = 0,*work,*workt,*zarray;
1632   const PetscScalar *x,*xb;
1633   const MatScalar   *v;
1634   PetscErrorCode    ierr;
1635   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1636   PetscInt          ncols,k;
1637   const PetscInt    *ridx = NULL,*idx,*ii;
1638   PetscBool         usecprow = a->compressedrow.use;
1639 
1640   PetscFunctionBegin;
1641   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1642   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1643   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1644 
1645   idx = a->j;
1646   v   = a->a;
1647   if (usecprow) {
1648     mbs  = a->compressedrow.nrows;
1649     ii   = a->compressedrow.i;
1650     ridx = a->compressedrow.rindex;
1651   } else {
1652     mbs = a->mbs;
1653     ii  = a->i;
1654     z   = zarray;
1655   }
1656 
1657   if (!a->mult_work) {
1658     k    = PetscMax(A->rmap->n,A->cmap->n);
1659     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1660   }
1661   work = a->mult_work;
1662   for (i=0; i<mbs; i++) {
1663     n     = ii[1] - ii[0]; ii++;
1664     ncols = n*bs;
1665     workt = work;
1666     for (j=0; j<n; j++) {
1667       xb = x + bs*(*idx++);
1668       for (k=0; k<bs; k++) workt[k] = xb[k];
1669       workt += bs;
1670     }
1671     if (usecprow) z = zarray + bs*ridx[i];
1672     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1673     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1674     v += n*bs2;
1675     if (!usecprow) z += bs;
1676   }
1677   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1678   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1679   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1680   PetscFunctionReturn(0);
1681 }
1682 
1683 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1684 {
1685   PetscScalar    zero = 0.0;
1686   PetscErrorCode ierr;
1687 
1688   PetscFunctionBegin;
1689   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1690   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1695 {
1696   PetscScalar    zero = 0.0;
1697   PetscErrorCode ierr;
1698 
1699   PetscFunctionBegin;
1700   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1701   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
1705 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1706 {
1707   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1708   PetscScalar       *z,x1,x2,x3,x4,x5;
1709   const PetscScalar *x,*xb = NULL;
1710   const MatScalar   *v;
1711   PetscErrorCode    ierr;
1712   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1713   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1714   Mat_CompressedRow cprow = a->compressedrow;
1715   PetscBool         usecprow = cprow.use;
1716 
1717   PetscFunctionBegin;
1718   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1719   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1720   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1721 
1722   idx = a->j;
1723   v   = a->a;
1724   if (usecprow) {
1725     mbs  = cprow.nrows;
1726     ii   = cprow.i;
1727     ridx = cprow.rindex;
1728   } else {
1729     mbs=a->mbs;
1730     ii = a->i;
1731     xb = x;
1732   }
1733 
1734   switch (bs) {
1735   case 1:
1736     for (i=0; i<mbs; i++) {
1737       if (usecprow) xb = x + ridx[i];
1738       x1 = xb[0];
1739       ib = idx + ii[0];
1740       n  = ii[1] - ii[0]; ii++;
1741       for (j=0; j<n; j++) {
1742         rval     = ib[j];
1743         z[rval] += PetscConj(*v) * x1;
1744         v++;
1745       }
1746       if (!usecprow) xb++;
1747     }
1748     break;
1749   case 2:
1750     for (i=0; i<mbs; i++) {
1751       if (usecprow) xb = x + 2*ridx[i];
1752       x1 = xb[0]; x2 = xb[1];
1753       ib = idx + ii[0];
1754       n  = ii[1] - ii[0]; ii++;
1755       for (j=0; j<n; j++) {
1756         rval       = ib[j]*2;
1757         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1758         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1759         v         += 4;
1760       }
1761       if (!usecprow) xb += 2;
1762     }
1763     break;
1764   case 3:
1765     for (i=0; i<mbs; i++) {
1766       if (usecprow) xb = x + 3*ridx[i];
1767       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1768       ib = idx + ii[0];
1769       n  = ii[1] - ii[0]; ii++;
1770       for (j=0; j<n; j++) {
1771         rval       = ib[j]*3;
1772         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1773         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1774         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1775         v         += 9;
1776       }
1777       if (!usecprow) xb += 3;
1778     }
1779     break;
1780   case 4:
1781     for (i=0; i<mbs; i++) {
1782       if (usecprow) xb = x + 4*ridx[i];
1783       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1784       ib = idx + ii[0];
1785       n  = ii[1] - ii[0]; ii++;
1786       for (j=0; j<n; j++) {
1787         rval       = ib[j]*4;
1788         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1789         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1790         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1791         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1792         v         += 16;
1793       }
1794       if (!usecprow) xb += 4;
1795     }
1796     break;
1797   case 5:
1798     for (i=0; i<mbs; i++) {
1799       if (usecprow) xb = x + 5*ridx[i];
1800       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1801       x4 = xb[3]; x5 = xb[4];
1802       ib = idx + ii[0];
1803       n  = ii[1] - ii[0]; ii++;
1804       for (j=0; j<n; j++) {
1805         rval       = ib[j]*5;
1806         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1807         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1808         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1809         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1810         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1811         v         += 25;
1812       }
1813       if (!usecprow) xb += 5;
1814     }
1815     break;
1816   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1817     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1818 #if 0
1819     {
1820       PetscInt          ncols,k,bs2=a->bs2;
1821       PetscScalar       *work,*workt,zb;
1822       const PetscScalar *xtmp;
1823       if (!a->mult_work) {
1824         k    = PetscMax(A->rmap->n,A->cmap->n);
1825         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1826       }
1827       work = a->mult_work;
1828       xtmp = x;
1829       for (i=0; i<mbs; i++) {
1830         n     = ii[1] - ii[0]; ii++;
1831         ncols = n*bs;
1832         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1833         if (usecprow) xtmp = x + bs*ridx[i];
1834         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1835         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1836         v += n*bs2;
1837         if (!usecprow) xtmp += bs;
1838         workt = work;
1839         for (j=0; j<n; j++) {
1840           zb = z + bs*(*idx++);
1841           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1842           workt += bs;
1843         }
1844       }
1845     }
1846 #endif
1847   }
1848   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1849   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1850   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1851   PetscFunctionReturn(0);
1852 }
1853 
1854 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1855 {
1856   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1857   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1858   const PetscScalar *x,*xb = 0;
1859   const MatScalar   *v;
1860   PetscErrorCode    ierr;
1861   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1862   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1863   Mat_CompressedRow cprow   = a->compressedrow;
1864   PetscBool         usecprow=cprow.use;
1865 
1866   PetscFunctionBegin;
1867   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1868   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1869   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1870 
1871   idx = a->j;
1872   v   = a->a;
1873   if (usecprow) {
1874     mbs  = cprow.nrows;
1875     ii   = cprow.i;
1876     ridx = cprow.rindex;
1877   } else {
1878     mbs=a->mbs;
1879     ii = a->i;
1880     xb = x;
1881   }
1882 
1883   switch (bs) {
1884   case 1:
1885     for (i=0; i<mbs; i++) {
1886       if (usecprow) xb = x + ridx[i];
1887       x1 = xb[0];
1888       ib = idx + ii[0];
1889       n  = ii[1] - ii[0]; ii++;
1890       for (j=0; j<n; j++) {
1891         rval     = ib[j];
1892         z[rval] += *v * x1;
1893         v++;
1894       }
1895       if (!usecprow) xb++;
1896     }
1897     break;
1898   case 2:
1899     for (i=0; i<mbs; i++) {
1900       if (usecprow) xb = x + 2*ridx[i];
1901       x1 = xb[0]; x2 = xb[1];
1902       ib = idx + ii[0];
1903       n  = ii[1] - ii[0]; ii++;
1904       for (j=0; j<n; j++) {
1905         rval       = ib[j]*2;
1906         z[rval++] += v[0]*x1 + v[1]*x2;
1907         z[rval++] += v[2]*x1 + v[3]*x2;
1908         v         += 4;
1909       }
1910       if (!usecprow) xb += 2;
1911     }
1912     break;
1913   case 3:
1914     for (i=0; i<mbs; i++) {
1915       if (usecprow) xb = x + 3*ridx[i];
1916       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1917       ib = idx + ii[0];
1918       n  = ii[1] - ii[0]; ii++;
1919       for (j=0; j<n; j++) {
1920         rval       = ib[j]*3;
1921         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1922         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1923         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1924         v         += 9;
1925       }
1926       if (!usecprow) xb += 3;
1927     }
1928     break;
1929   case 4:
1930     for (i=0; i<mbs; i++) {
1931       if (usecprow) xb = x + 4*ridx[i];
1932       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1933       ib = idx + ii[0];
1934       n  = ii[1] - ii[0]; ii++;
1935       for (j=0; j<n; j++) {
1936         rval       = ib[j]*4;
1937         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1938         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1939         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1940         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1941         v         += 16;
1942       }
1943       if (!usecprow) xb += 4;
1944     }
1945     break;
1946   case 5:
1947     for (i=0; i<mbs; i++) {
1948       if (usecprow) xb = x + 5*ridx[i];
1949       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1950       x4 = xb[3]; x5 = xb[4];
1951       ib = idx + ii[0];
1952       n  = ii[1] - ii[0]; ii++;
1953       for (j=0; j<n; j++) {
1954         rval       = ib[j]*5;
1955         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1956         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1957         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1958         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1959         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1960         v         += 25;
1961       }
1962       if (!usecprow) xb += 5;
1963     }
1964     break;
1965   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1966     PetscInt          ncols,k;
1967     PetscScalar       *work,*workt;
1968     const PetscScalar *xtmp;
1969     if (!a->mult_work) {
1970       k    = PetscMax(A->rmap->n,A->cmap->n);
1971       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1972     }
1973     work = a->mult_work;
1974     xtmp = x;
1975     for (i=0; i<mbs; i++) {
1976       n     = ii[1] - ii[0]; ii++;
1977       ncols = n*bs;
1978       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1979       if (usecprow) xtmp = x + bs*ridx[i];
1980       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1981       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1982       v += n*bs2;
1983       if (!usecprow) xtmp += bs;
1984       workt = work;
1985       for (j=0; j<n; j++) {
1986         zb = z + bs*(*idx++);
1987         for (k=0; k<bs; k++) zb[k] += workt[k];
1988         workt += bs;
1989       }
1990     }
1991     }
1992   }
1993   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1994   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1995   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1996   PetscFunctionReturn(0);
1997 }
1998 
1999 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2000 {
2001   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2002   PetscInt       totalnz = a->bs2*a->nz;
2003   PetscScalar    oalpha  = alpha;
2004   PetscErrorCode ierr;
2005   PetscBLASInt   one = 1,tnz;
2006 
2007   PetscFunctionBegin;
2008   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2009   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2010   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2011   PetscFunctionReturn(0);
2012 }
2013 
2014 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2015 {
2016   PetscErrorCode ierr;
2017   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2018   MatScalar      *v  = a->a;
2019   PetscReal      sum = 0.0;
2020   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2021 
2022   PetscFunctionBegin;
2023   if (type == NORM_FROBENIUS) {
2024 #if defined(PETSC_USE_REAL___FP16)
2025     PetscBLASInt one = 1,cnt = bs2*nz;
2026     *norm = BLASnrm2_(&cnt,v,&one);
2027 #else
2028     for (i=0; i<bs2*nz; i++) {
2029       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2030     }
2031 #endif
2032     *norm = PetscSqrtReal(sum);
2033     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
2034   } else if (type == NORM_1) { /* maximum column sum */
2035     PetscReal *tmp;
2036     PetscInt  *bcol = a->j;
2037     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2038     for (i=0; i<nz; i++) {
2039       for (j=0; j<bs; j++) {
2040         k1 = bs*(*bcol) + j; /* column index */
2041         for (k=0; k<bs; k++) {
2042           tmp[k1] += PetscAbsScalar(*v); v++;
2043         }
2044       }
2045       bcol++;
2046     }
2047     *norm = 0.0;
2048     for (j=0; j<A->cmap->n; j++) {
2049       if (tmp[j] > *norm) *norm = tmp[j];
2050     }
2051     ierr = PetscFree(tmp);CHKERRQ(ierr);
2052     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2053   } else if (type == NORM_INFINITY) { /* maximum row sum */
2054     *norm = 0.0;
2055     for (k=0; k<bs; k++) {
2056       for (j=0; j<a->mbs; j++) {
2057         v   = a->a + bs2*a->i[j] + k;
2058         sum = 0.0;
2059         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2060           for (k1=0; k1<bs; k1++) {
2061             sum += PetscAbsScalar(*v);
2062             v   += bs;
2063           }
2064         }
2065         if (sum > *norm) *norm = sum;
2066       }
2067     }
2068     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2069   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2070   PetscFunctionReturn(0);
2071 }
2072 
2073 
2074 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2075 {
2076   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2077   PetscErrorCode ierr;
2078 
2079   PetscFunctionBegin;
2080   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2081   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2082     *flg = PETSC_FALSE;
2083     PetscFunctionReturn(0);
2084   }
2085 
2086   /* if the a->i are the same */
2087   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2088   if (!*flg) PetscFunctionReturn(0);
2089 
2090   /* if a->j are the same */
2091   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
2092   if (!*flg) PetscFunctionReturn(0);
2093 
2094   /* if a->a are the same */
2095   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2096   PetscFunctionReturn(0);
2097 
2098 }
2099 
2100 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2101 {
2102   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2103   PetscErrorCode ierr;
2104   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2105   PetscScalar    *x,zero = 0.0;
2106   MatScalar      *aa,*aa_j;
2107 
2108   PetscFunctionBegin;
2109   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2110   bs   = A->rmap->bs;
2111   aa   = a->a;
2112   ai   = a->i;
2113   aj   = a->j;
2114   ambs = a->mbs;
2115   bs2  = a->bs2;
2116 
2117   ierr = VecSet(v,zero);CHKERRQ(ierr);
2118   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2119   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2120   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2121   for (i=0; i<ambs; i++) {
2122     for (j=ai[i]; j<ai[i+1]; j++) {
2123       if (aj[j] == i) {
2124         row  = i*bs;
2125         aa_j = aa+j*bs2;
2126         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2127         break;
2128       }
2129     }
2130   }
2131   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2132   PetscFunctionReturn(0);
2133 }
2134 
2135 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2136 {
2137   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2138   const PetscScalar *l,*r,*li,*ri;
2139   PetscScalar       x;
2140   MatScalar         *aa, *v;
2141   PetscErrorCode    ierr;
2142   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2143   const PetscInt    *ai,*aj;
2144 
2145   PetscFunctionBegin;
2146   ai  = a->i;
2147   aj  = a->j;
2148   aa  = a->a;
2149   m   = A->rmap->n;
2150   n   = A->cmap->n;
2151   bs  = A->rmap->bs;
2152   mbs = a->mbs;
2153   bs2 = a->bs2;
2154   if (ll) {
2155     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2156     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2157     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2158     for (i=0; i<mbs; i++) { /* for each block row */
2159       M  = ai[i+1] - ai[i];
2160       li = l + i*bs;
2161       v  = aa + bs2*ai[i];
2162       for (j=0; j<M; j++) { /* for each block */
2163         for (k=0; k<bs2; k++) {
2164           (*v++) *= li[k%bs];
2165         }
2166       }
2167     }
2168     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2169     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2170   }
2171 
2172   if (rr) {
2173     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2174     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2175     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2176     for (i=0; i<mbs; i++) { /* for each block row */
2177       iai = ai[i];
2178       M   = ai[i+1] - iai;
2179       v   = aa + bs2*iai;
2180       for (j=0; j<M; j++) { /* for each block */
2181         ri = r + bs*aj[iai+j];
2182         for (k=0; k<bs; k++) {
2183           x = ri[k];
2184           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2185           v += bs;
2186         }
2187       }
2188     }
2189     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2190     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2191   }
2192   PetscFunctionReturn(0);
2193 }
2194 
2195 
2196 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2197 {
2198   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2199 
2200   PetscFunctionBegin;
2201   info->block_size   = a->bs2;
2202   info->nz_allocated = a->bs2*a->maxnz;
2203   info->nz_used      = a->bs2*a->nz;
2204   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2205   info->assemblies   = A->num_ass;
2206   info->mallocs      = A->info.mallocs;
2207   info->memory       = ((PetscObject)A)->mem;
2208   if (A->factortype) {
2209     info->fill_ratio_given  = A->info.fill_ratio_given;
2210     info->fill_ratio_needed = A->info.fill_ratio_needed;
2211     info->factor_mallocs    = A->info.factor_mallocs;
2212   } else {
2213     info->fill_ratio_given  = 0;
2214     info->fill_ratio_needed = 0;
2215     info->factor_mallocs    = 0;
2216   }
2217   PetscFunctionReturn(0);
2218 }
2219 
2220 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2221 {
2222   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2223   PetscErrorCode ierr;
2224 
2225   PetscFunctionBegin;
2226   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2227   PetscFunctionReturn(0);
2228 }
2229