xref: /petsc/src/mat/impls/baij/seq/baij.c (revision e0f5bfbec699682fa3e8b8532b1176849ea4e12a)
1 
2 /*
3     Defines the basic matrix operations for the BAIJ (compressed row)
4   matrix storage format.
5 */
6 #include <../src/mat/impls/baij/seq/baij.h> /*I   "petscmat.h"  I*/
7 #include <petscblaslapack.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 #include <petsc/private/kernels/blockmatmult.h>
10 
11 #if defined(PETSC_HAVE_HYPRE)
12 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
13 #endif
14 
15 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
16 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
17 #endif
18 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
19 
20 PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions) {
21   Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
22   PetscInt     m, n, i;
23   PetscInt     ib, jb, bs = A->rmap->bs;
24   MatScalar   *a_val = a_aij->a;
25 
26   PetscFunctionBegin;
27   PetscCall(MatGetSize(A, &m, &n));
28   for (i = 0; i < n; i++) reductions[i] = 0.0;
29   if (type == NORM_2) {
30     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
31       for (jb = 0; jb < bs; jb++) {
32         for (ib = 0; ib < bs; ib++) {
33           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
34           a_val++;
35         }
36       }
37     }
38   } else if (type == NORM_1) {
39     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
40       for (jb = 0; jb < bs; jb++) {
41         for (ib = 0; ib < bs; ib++) {
42           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
43           a_val++;
44         }
45       }
46     }
47   } else if (type == NORM_INFINITY) {
48     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
49       for (jb = 0; jb < bs; jb++) {
50         for (ib = 0; ib < bs; ib++) {
51           int col         = A->cmap->rstart + a_aij->j[i] * bs + jb;
52           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
53           a_val++;
54         }
55       }
56     }
57   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
58     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
59       for (jb = 0; jb < bs; jb++) {
60         for (ib = 0; ib < bs; ib++) {
61           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
62           a_val++;
63         }
64       }
65     }
66   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
67     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
68       for (jb = 0; jb < bs; jb++) {
69         for (ib = 0; ib < bs; ib++) {
70           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
71           a_val++;
72         }
73       }
74     }
75   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
76   if (type == NORM_2) {
77     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
78   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
79     for (i = 0; i < n; i++) reductions[i] /= m;
80   }
81   PetscFunctionReturn(0);
82 }
83 
84 PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values) {
85   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
86   PetscInt    *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
87   MatScalar   *v     = a->a, *odiag, *diag, work[25], *v_work;
88   PetscReal    shift = 0.0;
89   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
90 
91   PetscFunctionBegin;
92   allowzeropivot = PetscNot(A->erroriffailure);
93 
94   if (a->idiagvalid) {
95     if (values) *values = a->idiag;
96     PetscFunctionReturn(0);
97   }
98   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
99   diag_offset = a->diag;
100   if (!a->idiag) { PetscCall(PetscMalloc1(bs2 * mbs, &a->idiag)); }
101   diag = a->idiag;
102   if (values) *values = a->idiag;
103   /* factor and invert each block */
104   switch (bs) {
105   case 1:
106     for (i = 0; i < mbs; i++) {
107       odiag   = v + 1 * diag_offset[i];
108       diag[0] = odiag[0];
109 
110       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
111         if (allowzeropivot) {
112           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
113           A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
114           A->factorerror_zeropivot_row   = i;
115           PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i));
116         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
117       }
118 
119       diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
120       diag += 1;
121     }
122     break;
123   case 2:
124     for (i = 0; i < mbs; i++) {
125       odiag   = v + 4 * diag_offset[i];
126       diag[0] = odiag[0];
127       diag[1] = odiag[1];
128       diag[2] = odiag[2];
129       diag[3] = odiag[3];
130       PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
131       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
132       diag += 4;
133     }
134     break;
135   case 3:
136     for (i = 0; i < mbs; i++) {
137       odiag   = v + 9 * diag_offset[i];
138       diag[0] = odiag[0];
139       diag[1] = odiag[1];
140       diag[2] = odiag[2];
141       diag[3] = odiag[3];
142       diag[4] = odiag[4];
143       diag[5] = odiag[5];
144       diag[6] = odiag[6];
145       diag[7] = odiag[7];
146       diag[8] = odiag[8];
147       PetscCall(PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected));
148       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
149       diag += 9;
150     }
151     break;
152   case 4:
153     for (i = 0; i < mbs; i++) {
154       odiag = v + 16 * diag_offset[i];
155       PetscCall(PetscArraycpy(diag, odiag, 16));
156       PetscCall(PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected));
157       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
158       diag += 16;
159     }
160     break;
161   case 5:
162     for (i = 0; i < mbs; i++) {
163       odiag = v + 25 * diag_offset[i];
164       PetscCall(PetscArraycpy(diag, odiag, 25));
165       PetscCall(PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
166       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
167       diag += 25;
168     }
169     break;
170   case 6:
171     for (i = 0; i < mbs; i++) {
172       odiag = v + 36 * diag_offset[i];
173       PetscCall(PetscArraycpy(diag, odiag, 36));
174       PetscCall(PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected));
175       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
176       diag += 36;
177     }
178     break;
179   case 7:
180     for (i = 0; i < mbs; i++) {
181       odiag = v + 49 * diag_offset[i];
182       PetscCall(PetscArraycpy(diag, odiag, 49));
183       PetscCall(PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected));
184       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
185       diag += 49;
186     }
187     break;
188   default:
189     PetscCall(PetscMalloc2(bs, &v_work, bs, &v_pivots));
190     for (i = 0; i < mbs; i++) {
191       odiag = v + bs2 * diag_offset[i];
192       PetscCall(PetscArraycpy(diag, odiag, bs2));
193       PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
194       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
195       diag += bs2;
196     }
197     PetscCall(PetscFree2(v_work, v_pivots));
198   }
199   a->idiagvalid = PETSC_TRUE;
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
204   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
205   PetscScalar       *x, *work, *w, *workt, *t;
206   const MatScalar   *v, *aa = a->a, *idiag;
207   const PetscScalar *b, *xb;
208   PetscScalar        s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
209   PetscInt           m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
210   const PetscInt    *diag, *ai = a->i, *aj = a->j, *vi;
211 
212   PetscFunctionBegin;
213   its = its * lits;
214   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
215   PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
216   PetscCheck(!fshift, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for diagonal shift");
217   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for non-trivial relaxation factor");
218   PetscCheck(!(flag & SOR_APPLY_UPPER) && !(flag & SOR_APPLY_LOWER), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for applying upper or lower triangular parts");
219 
220   if (!a->idiagvalid) PetscCall(MatInvertBlockDiagonal(A, NULL));
221 
222   if (!m) PetscFunctionReturn(0);
223   diag  = a->diag;
224   idiag = a->idiag;
225   k     = PetscMax(A->rmap->n, A->cmap->n);
226   if (!a->mult_work) PetscCall(PetscMalloc1(k + 1, &a->mult_work));
227   if (!a->sor_workt) PetscCall(PetscMalloc1(k, &a->sor_workt));
228   if (!a->sor_work) PetscCall(PetscMalloc1(bs, &a->sor_work));
229   work = a->mult_work;
230   t    = a->sor_workt;
231   w    = a->sor_work;
232 
233   PetscCall(VecGetArray(xx, &x));
234   PetscCall(VecGetArrayRead(bb, &b));
235 
236   if (flag & SOR_ZERO_INITIAL_GUESS) {
237     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
238       switch (bs) {
239       case 1:
240         PetscKernel_v_gets_A_times_w_1(x, idiag, b);
241         t[0] = b[0];
242         i2   = 1;
243         idiag += 1;
244         for (i = 1; i < m; i++) {
245           v    = aa + ai[i];
246           vi   = aj + ai[i];
247           nz   = diag[i] - ai[i];
248           s[0] = b[i2];
249           for (j = 0; j < nz; j++) {
250             xw[0] = x[vi[j]];
251             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
252           }
253           t[i2] = s[0];
254           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
255           x[i2] = xw[0];
256           idiag += 1;
257           i2 += 1;
258         }
259         break;
260       case 2:
261         PetscKernel_v_gets_A_times_w_2(x, idiag, b);
262         t[0] = b[0];
263         t[1] = b[1];
264         i2   = 2;
265         idiag += 4;
266         for (i = 1; i < m; i++) {
267           v    = aa + 4 * ai[i];
268           vi   = aj + ai[i];
269           nz   = diag[i] - ai[i];
270           s[0] = b[i2];
271           s[1] = b[i2 + 1];
272           for (j = 0; j < nz; j++) {
273             idx   = 2 * vi[j];
274             it    = 4 * j;
275             xw[0] = x[idx];
276             xw[1] = x[1 + idx];
277             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
278           }
279           t[i2]     = s[0];
280           t[i2 + 1] = s[1];
281           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
282           x[i2]     = xw[0];
283           x[i2 + 1] = xw[1];
284           idiag += 4;
285           i2 += 2;
286         }
287         break;
288       case 3:
289         PetscKernel_v_gets_A_times_w_3(x, idiag, b);
290         t[0] = b[0];
291         t[1] = b[1];
292         t[2] = b[2];
293         i2   = 3;
294         idiag += 9;
295         for (i = 1; i < m; i++) {
296           v    = aa + 9 * ai[i];
297           vi   = aj + ai[i];
298           nz   = diag[i] - ai[i];
299           s[0] = b[i2];
300           s[1] = b[i2 + 1];
301           s[2] = b[i2 + 2];
302           while (nz--) {
303             idx   = 3 * (*vi++);
304             xw[0] = x[idx];
305             xw[1] = x[1 + idx];
306             xw[2] = x[2 + idx];
307             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
308             v += 9;
309           }
310           t[i2]     = s[0];
311           t[i2 + 1] = s[1];
312           t[i2 + 2] = s[2];
313           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
314           x[i2]     = xw[0];
315           x[i2 + 1] = xw[1];
316           x[i2 + 2] = xw[2];
317           idiag += 9;
318           i2 += 3;
319         }
320         break;
321       case 4:
322         PetscKernel_v_gets_A_times_w_4(x, idiag, b);
323         t[0] = b[0];
324         t[1] = b[1];
325         t[2] = b[2];
326         t[3] = b[3];
327         i2   = 4;
328         idiag += 16;
329         for (i = 1; i < m; i++) {
330           v    = aa + 16 * ai[i];
331           vi   = aj + ai[i];
332           nz   = diag[i] - ai[i];
333           s[0] = b[i2];
334           s[1] = b[i2 + 1];
335           s[2] = b[i2 + 2];
336           s[3] = b[i2 + 3];
337           while (nz--) {
338             idx   = 4 * (*vi++);
339             xw[0] = x[idx];
340             xw[1] = x[1 + idx];
341             xw[2] = x[2 + idx];
342             xw[3] = x[3 + idx];
343             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
344             v += 16;
345           }
346           t[i2]     = s[0];
347           t[i2 + 1] = s[1];
348           t[i2 + 2] = s[2];
349           t[i2 + 3] = s[3];
350           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
351           x[i2]     = xw[0];
352           x[i2 + 1] = xw[1];
353           x[i2 + 2] = xw[2];
354           x[i2 + 3] = xw[3];
355           idiag += 16;
356           i2 += 4;
357         }
358         break;
359       case 5:
360         PetscKernel_v_gets_A_times_w_5(x, idiag, b);
361         t[0] = b[0];
362         t[1] = b[1];
363         t[2] = b[2];
364         t[3] = b[3];
365         t[4] = b[4];
366         i2   = 5;
367         idiag += 25;
368         for (i = 1; i < m; i++) {
369           v    = aa + 25 * ai[i];
370           vi   = aj + ai[i];
371           nz   = diag[i] - ai[i];
372           s[0] = b[i2];
373           s[1] = b[i2 + 1];
374           s[2] = b[i2 + 2];
375           s[3] = b[i2 + 3];
376           s[4] = b[i2 + 4];
377           while (nz--) {
378             idx   = 5 * (*vi++);
379             xw[0] = x[idx];
380             xw[1] = x[1 + idx];
381             xw[2] = x[2 + idx];
382             xw[3] = x[3 + idx];
383             xw[4] = x[4 + idx];
384             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
385             v += 25;
386           }
387           t[i2]     = s[0];
388           t[i2 + 1] = s[1];
389           t[i2 + 2] = s[2];
390           t[i2 + 3] = s[3];
391           t[i2 + 4] = s[4];
392           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
393           x[i2]     = xw[0];
394           x[i2 + 1] = xw[1];
395           x[i2 + 2] = xw[2];
396           x[i2 + 3] = xw[3];
397           x[i2 + 4] = xw[4];
398           idiag += 25;
399           i2 += 5;
400         }
401         break;
402       case 6:
403         PetscKernel_v_gets_A_times_w_6(x, idiag, b);
404         t[0] = b[0];
405         t[1] = b[1];
406         t[2] = b[2];
407         t[3] = b[3];
408         t[4] = b[4];
409         t[5] = b[5];
410         i2   = 6;
411         idiag += 36;
412         for (i = 1; i < m; i++) {
413           v    = aa + 36 * ai[i];
414           vi   = aj + ai[i];
415           nz   = diag[i] - ai[i];
416           s[0] = b[i2];
417           s[1] = b[i2 + 1];
418           s[2] = b[i2 + 2];
419           s[3] = b[i2 + 3];
420           s[4] = b[i2 + 4];
421           s[5] = b[i2 + 5];
422           while (nz--) {
423             idx   = 6 * (*vi++);
424             xw[0] = x[idx];
425             xw[1] = x[1 + idx];
426             xw[2] = x[2 + idx];
427             xw[3] = x[3 + idx];
428             xw[4] = x[4 + idx];
429             xw[5] = x[5 + idx];
430             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
431             v += 36;
432           }
433           t[i2]     = s[0];
434           t[i2 + 1] = s[1];
435           t[i2 + 2] = s[2];
436           t[i2 + 3] = s[3];
437           t[i2 + 4] = s[4];
438           t[i2 + 5] = s[5];
439           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
440           x[i2]     = xw[0];
441           x[i2 + 1] = xw[1];
442           x[i2 + 2] = xw[2];
443           x[i2 + 3] = xw[3];
444           x[i2 + 4] = xw[4];
445           x[i2 + 5] = xw[5];
446           idiag += 36;
447           i2 += 6;
448         }
449         break;
450       case 7:
451         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
452         t[0] = b[0];
453         t[1] = b[1];
454         t[2] = b[2];
455         t[3] = b[3];
456         t[4] = b[4];
457         t[5] = b[5];
458         t[6] = b[6];
459         i2   = 7;
460         idiag += 49;
461         for (i = 1; i < m; i++) {
462           v    = aa + 49 * ai[i];
463           vi   = aj + ai[i];
464           nz   = diag[i] - ai[i];
465           s[0] = b[i2];
466           s[1] = b[i2 + 1];
467           s[2] = b[i2 + 2];
468           s[3] = b[i2 + 3];
469           s[4] = b[i2 + 4];
470           s[5] = b[i2 + 5];
471           s[6] = b[i2 + 6];
472           while (nz--) {
473             idx   = 7 * (*vi++);
474             xw[0] = x[idx];
475             xw[1] = x[1 + idx];
476             xw[2] = x[2 + idx];
477             xw[3] = x[3 + idx];
478             xw[4] = x[4 + idx];
479             xw[5] = x[5 + idx];
480             xw[6] = x[6 + idx];
481             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
482             v += 49;
483           }
484           t[i2]     = s[0];
485           t[i2 + 1] = s[1];
486           t[i2 + 2] = s[2];
487           t[i2 + 3] = s[3];
488           t[i2 + 4] = s[4];
489           t[i2 + 5] = s[5];
490           t[i2 + 6] = s[6];
491           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
492           x[i2]     = xw[0];
493           x[i2 + 1] = xw[1];
494           x[i2 + 2] = xw[2];
495           x[i2 + 3] = xw[3];
496           x[i2 + 4] = xw[4];
497           x[i2 + 5] = xw[5];
498           x[i2 + 6] = xw[6];
499           idiag += 49;
500           i2 += 7;
501         }
502         break;
503       default:
504         PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
505         PetscCall(PetscArraycpy(t, b, bs));
506         i2 = bs;
507         idiag += bs2;
508         for (i = 1; i < m; i++) {
509           v  = aa + bs2 * ai[i];
510           vi = aj + ai[i];
511           nz = diag[i] - ai[i];
512 
513           PetscCall(PetscArraycpy(w, b + i2, bs));
514           /* copy all rows of x that are needed into contiguous space */
515           workt = work;
516           for (j = 0; j < nz; j++) {
517             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
518             workt += bs;
519           }
520           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
521           PetscCall(PetscArraycpy(t + i2, w, bs));
522           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
523 
524           idiag += bs2;
525           i2 += bs;
526         }
527         break;
528       }
529       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
530       PetscCall(PetscLogFlops(1.0 * bs2 * a->nz));
531       xb = t;
532     } else xb = b;
533     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
534       idiag = a->idiag + bs2 * (a->mbs - 1);
535       i2    = bs * (m - 1);
536       switch (bs) {
537       case 1:
538         s[0] = xb[i2];
539         PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
540         x[i2] = xw[0];
541         i2 -= 1;
542         for (i = m - 2; i >= 0; i--) {
543           v    = aa + (diag[i] + 1);
544           vi   = aj + diag[i] + 1;
545           nz   = ai[i + 1] - diag[i] - 1;
546           s[0] = xb[i2];
547           for (j = 0; j < nz; j++) {
548             xw[0] = x[vi[j]];
549             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
550           }
551           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
552           x[i2] = xw[0];
553           idiag -= 1;
554           i2 -= 1;
555         }
556         break;
557       case 2:
558         s[0] = xb[i2];
559         s[1] = xb[i2 + 1];
560         PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
561         x[i2]     = xw[0];
562         x[i2 + 1] = xw[1];
563         i2 -= 2;
564         idiag -= 4;
565         for (i = m - 2; i >= 0; i--) {
566           v    = aa + 4 * (diag[i] + 1);
567           vi   = aj + diag[i] + 1;
568           nz   = ai[i + 1] - diag[i] - 1;
569           s[0] = xb[i2];
570           s[1] = xb[i2 + 1];
571           for (j = 0; j < nz; j++) {
572             idx   = 2 * vi[j];
573             it    = 4 * j;
574             xw[0] = x[idx];
575             xw[1] = x[1 + idx];
576             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
577           }
578           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
579           x[i2]     = xw[0];
580           x[i2 + 1] = xw[1];
581           idiag -= 4;
582           i2 -= 2;
583         }
584         break;
585       case 3:
586         s[0] = xb[i2];
587         s[1] = xb[i2 + 1];
588         s[2] = xb[i2 + 2];
589         PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
590         x[i2]     = xw[0];
591         x[i2 + 1] = xw[1];
592         x[i2 + 2] = xw[2];
593         i2 -= 3;
594         idiag -= 9;
595         for (i = m - 2; i >= 0; i--) {
596           v    = aa + 9 * (diag[i] + 1);
597           vi   = aj + diag[i] + 1;
598           nz   = ai[i + 1] - diag[i] - 1;
599           s[0] = xb[i2];
600           s[1] = xb[i2 + 1];
601           s[2] = xb[i2 + 2];
602           while (nz--) {
603             idx   = 3 * (*vi++);
604             xw[0] = x[idx];
605             xw[1] = x[1 + idx];
606             xw[2] = x[2 + idx];
607             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
608             v += 9;
609           }
610           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
611           x[i2]     = xw[0];
612           x[i2 + 1] = xw[1];
613           x[i2 + 2] = xw[2];
614           idiag -= 9;
615           i2 -= 3;
616         }
617         break;
618       case 4:
619         s[0] = xb[i2];
620         s[1] = xb[i2 + 1];
621         s[2] = xb[i2 + 2];
622         s[3] = xb[i2 + 3];
623         PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
624         x[i2]     = xw[0];
625         x[i2 + 1] = xw[1];
626         x[i2 + 2] = xw[2];
627         x[i2 + 3] = xw[3];
628         i2 -= 4;
629         idiag -= 16;
630         for (i = m - 2; i >= 0; i--) {
631           v    = aa + 16 * (diag[i] + 1);
632           vi   = aj + diag[i] + 1;
633           nz   = ai[i + 1] - diag[i] - 1;
634           s[0] = xb[i2];
635           s[1] = xb[i2 + 1];
636           s[2] = xb[i2 + 2];
637           s[3] = xb[i2 + 3];
638           while (nz--) {
639             idx   = 4 * (*vi++);
640             xw[0] = x[idx];
641             xw[1] = x[1 + idx];
642             xw[2] = x[2 + idx];
643             xw[3] = x[3 + idx];
644             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
645             v += 16;
646           }
647           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
648           x[i2]     = xw[0];
649           x[i2 + 1] = xw[1];
650           x[i2 + 2] = xw[2];
651           x[i2 + 3] = xw[3];
652           idiag -= 16;
653           i2 -= 4;
654         }
655         break;
656       case 5:
657         s[0] = xb[i2];
658         s[1] = xb[i2 + 1];
659         s[2] = xb[i2 + 2];
660         s[3] = xb[i2 + 3];
661         s[4] = xb[i2 + 4];
662         PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
663         x[i2]     = xw[0];
664         x[i2 + 1] = xw[1];
665         x[i2 + 2] = xw[2];
666         x[i2 + 3] = xw[3];
667         x[i2 + 4] = xw[4];
668         i2 -= 5;
669         idiag -= 25;
670         for (i = m - 2; i >= 0; i--) {
671           v    = aa + 25 * (diag[i] + 1);
672           vi   = aj + diag[i] + 1;
673           nz   = ai[i + 1] - diag[i] - 1;
674           s[0] = xb[i2];
675           s[1] = xb[i2 + 1];
676           s[2] = xb[i2 + 2];
677           s[3] = xb[i2 + 3];
678           s[4] = xb[i2 + 4];
679           while (nz--) {
680             idx   = 5 * (*vi++);
681             xw[0] = x[idx];
682             xw[1] = x[1 + idx];
683             xw[2] = x[2 + idx];
684             xw[3] = x[3 + idx];
685             xw[4] = x[4 + idx];
686             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
687             v += 25;
688           }
689           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
690           x[i2]     = xw[0];
691           x[i2 + 1] = xw[1];
692           x[i2 + 2] = xw[2];
693           x[i2 + 3] = xw[3];
694           x[i2 + 4] = xw[4];
695           idiag -= 25;
696           i2 -= 5;
697         }
698         break;
699       case 6:
700         s[0] = xb[i2];
701         s[1] = xb[i2 + 1];
702         s[2] = xb[i2 + 2];
703         s[3] = xb[i2 + 3];
704         s[4] = xb[i2 + 4];
705         s[5] = xb[i2 + 5];
706         PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
707         x[i2]     = xw[0];
708         x[i2 + 1] = xw[1];
709         x[i2 + 2] = xw[2];
710         x[i2 + 3] = xw[3];
711         x[i2 + 4] = xw[4];
712         x[i2 + 5] = xw[5];
713         i2 -= 6;
714         idiag -= 36;
715         for (i = m - 2; i >= 0; i--) {
716           v    = aa + 36 * (diag[i] + 1);
717           vi   = aj + diag[i] + 1;
718           nz   = ai[i + 1] - diag[i] - 1;
719           s[0] = xb[i2];
720           s[1] = xb[i2 + 1];
721           s[2] = xb[i2 + 2];
722           s[3] = xb[i2 + 3];
723           s[4] = xb[i2 + 4];
724           s[5] = xb[i2 + 5];
725           while (nz--) {
726             idx   = 6 * (*vi++);
727             xw[0] = x[idx];
728             xw[1] = x[1 + idx];
729             xw[2] = x[2 + idx];
730             xw[3] = x[3 + idx];
731             xw[4] = x[4 + idx];
732             xw[5] = x[5 + idx];
733             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
734             v += 36;
735           }
736           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
737           x[i2]     = xw[0];
738           x[i2 + 1] = xw[1];
739           x[i2 + 2] = xw[2];
740           x[i2 + 3] = xw[3];
741           x[i2 + 4] = xw[4];
742           x[i2 + 5] = xw[5];
743           idiag -= 36;
744           i2 -= 6;
745         }
746         break;
747       case 7:
748         s[0] = xb[i2];
749         s[1] = xb[i2 + 1];
750         s[2] = xb[i2 + 2];
751         s[3] = xb[i2 + 3];
752         s[4] = xb[i2 + 4];
753         s[5] = xb[i2 + 5];
754         s[6] = xb[i2 + 6];
755         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
756         x[i2]     = xw[0];
757         x[i2 + 1] = xw[1];
758         x[i2 + 2] = xw[2];
759         x[i2 + 3] = xw[3];
760         x[i2 + 4] = xw[4];
761         x[i2 + 5] = xw[5];
762         x[i2 + 6] = xw[6];
763         i2 -= 7;
764         idiag -= 49;
765         for (i = m - 2; i >= 0; i--) {
766           v    = aa + 49 * (diag[i] + 1);
767           vi   = aj + diag[i] + 1;
768           nz   = ai[i + 1] - diag[i] - 1;
769           s[0] = xb[i2];
770           s[1] = xb[i2 + 1];
771           s[2] = xb[i2 + 2];
772           s[3] = xb[i2 + 3];
773           s[4] = xb[i2 + 4];
774           s[5] = xb[i2 + 5];
775           s[6] = xb[i2 + 6];
776           while (nz--) {
777             idx   = 7 * (*vi++);
778             xw[0] = x[idx];
779             xw[1] = x[1 + idx];
780             xw[2] = x[2 + idx];
781             xw[3] = x[3 + idx];
782             xw[4] = x[4 + idx];
783             xw[5] = x[5 + idx];
784             xw[6] = x[6 + idx];
785             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
786             v += 49;
787           }
788           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
789           x[i2]     = xw[0];
790           x[i2 + 1] = xw[1];
791           x[i2 + 2] = xw[2];
792           x[i2 + 3] = xw[3];
793           x[i2 + 4] = xw[4];
794           x[i2 + 5] = xw[5];
795           x[i2 + 6] = xw[6];
796           idiag -= 49;
797           i2 -= 7;
798         }
799         break;
800       default:
801         PetscCall(PetscArraycpy(w, xb + i2, bs));
802         PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
803         i2 -= bs;
804         idiag -= bs2;
805         for (i = m - 2; i >= 0; i--) {
806           v  = aa + bs2 * (diag[i] + 1);
807           vi = aj + diag[i] + 1;
808           nz = ai[i + 1] - diag[i] - 1;
809 
810           PetscCall(PetscArraycpy(w, xb + i2, bs));
811           /* copy all rows of x that are needed into contiguous space */
812           workt = work;
813           for (j = 0; j < nz; j++) {
814             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
815             workt += bs;
816           }
817           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
818           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
819 
820           idiag -= bs2;
821           i2 -= bs;
822         }
823         break;
824       }
825       PetscCall(PetscLogFlops(1.0 * bs2 * (a->nz)));
826     }
827     its--;
828   }
829   while (its--) {
830     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
831       idiag = a->idiag;
832       i2    = 0;
833       switch (bs) {
834       case 1:
835         for (i = 0; i < m; i++) {
836           v    = aa + ai[i];
837           vi   = aj + ai[i];
838           nz   = ai[i + 1] - ai[i];
839           s[0] = b[i2];
840           for (j = 0; j < nz; j++) {
841             xw[0] = x[vi[j]];
842             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
843           }
844           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
845           x[i2] += xw[0];
846           idiag += 1;
847           i2 += 1;
848         }
849         break;
850       case 2:
851         for (i = 0; i < m; i++) {
852           v    = aa + 4 * ai[i];
853           vi   = aj + ai[i];
854           nz   = ai[i + 1] - ai[i];
855           s[0] = b[i2];
856           s[1] = b[i2 + 1];
857           for (j = 0; j < nz; j++) {
858             idx   = 2 * vi[j];
859             it    = 4 * j;
860             xw[0] = x[idx];
861             xw[1] = x[1 + idx];
862             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
863           }
864           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
865           x[i2] += xw[0];
866           x[i2 + 1] += xw[1];
867           idiag += 4;
868           i2 += 2;
869         }
870         break;
871       case 3:
872         for (i = 0; i < m; i++) {
873           v    = aa + 9 * ai[i];
874           vi   = aj + ai[i];
875           nz   = ai[i + 1] - ai[i];
876           s[0] = b[i2];
877           s[1] = b[i2 + 1];
878           s[2] = b[i2 + 2];
879           while (nz--) {
880             idx   = 3 * (*vi++);
881             xw[0] = x[idx];
882             xw[1] = x[1 + idx];
883             xw[2] = x[2 + idx];
884             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
885             v += 9;
886           }
887           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
888           x[i2] += xw[0];
889           x[i2 + 1] += xw[1];
890           x[i2 + 2] += xw[2];
891           idiag += 9;
892           i2 += 3;
893         }
894         break;
895       case 4:
896         for (i = 0; i < m; i++) {
897           v    = aa + 16 * ai[i];
898           vi   = aj + ai[i];
899           nz   = ai[i + 1] - ai[i];
900           s[0] = b[i2];
901           s[1] = b[i2 + 1];
902           s[2] = b[i2 + 2];
903           s[3] = b[i2 + 3];
904           while (nz--) {
905             idx   = 4 * (*vi++);
906             xw[0] = x[idx];
907             xw[1] = x[1 + idx];
908             xw[2] = x[2 + idx];
909             xw[3] = x[3 + idx];
910             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
911             v += 16;
912           }
913           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
914           x[i2] += xw[0];
915           x[i2 + 1] += xw[1];
916           x[i2 + 2] += xw[2];
917           x[i2 + 3] += xw[3];
918           idiag += 16;
919           i2 += 4;
920         }
921         break;
922       case 5:
923         for (i = 0; i < m; i++) {
924           v    = aa + 25 * ai[i];
925           vi   = aj + ai[i];
926           nz   = ai[i + 1] - ai[i];
927           s[0] = b[i2];
928           s[1] = b[i2 + 1];
929           s[2] = b[i2 + 2];
930           s[3] = b[i2 + 3];
931           s[4] = b[i2 + 4];
932           while (nz--) {
933             idx   = 5 * (*vi++);
934             xw[0] = x[idx];
935             xw[1] = x[1 + idx];
936             xw[2] = x[2 + idx];
937             xw[3] = x[3 + idx];
938             xw[4] = x[4 + idx];
939             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
940             v += 25;
941           }
942           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
943           x[i2] += xw[0];
944           x[i2 + 1] += xw[1];
945           x[i2 + 2] += xw[2];
946           x[i2 + 3] += xw[3];
947           x[i2 + 4] += xw[4];
948           idiag += 25;
949           i2 += 5;
950         }
951         break;
952       case 6:
953         for (i = 0; i < m; i++) {
954           v    = aa + 36 * ai[i];
955           vi   = aj + ai[i];
956           nz   = ai[i + 1] - ai[i];
957           s[0] = b[i2];
958           s[1] = b[i2 + 1];
959           s[2] = b[i2 + 2];
960           s[3] = b[i2 + 3];
961           s[4] = b[i2 + 4];
962           s[5] = b[i2 + 5];
963           while (nz--) {
964             idx   = 6 * (*vi++);
965             xw[0] = x[idx];
966             xw[1] = x[1 + idx];
967             xw[2] = x[2 + idx];
968             xw[3] = x[3 + idx];
969             xw[4] = x[4 + idx];
970             xw[5] = x[5 + idx];
971             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
972             v += 36;
973           }
974           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
975           x[i2] += xw[0];
976           x[i2 + 1] += xw[1];
977           x[i2 + 2] += xw[2];
978           x[i2 + 3] += xw[3];
979           x[i2 + 4] += xw[4];
980           x[i2 + 5] += xw[5];
981           idiag += 36;
982           i2 += 6;
983         }
984         break;
985       case 7:
986         for (i = 0; i < m; i++) {
987           v    = aa + 49 * ai[i];
988           vi   = aj + ai[i];
989           nz   = ai[i + 1] - ai[i];
990           s[0] = b[i2];
991           s[1] = b[i2 + 1];
992           s[2] = b[i2 + 2];
993           s[3] = b[i2 + 3];
994           s[4] = b[i2 + 4];
995           s[5] = b[i2 + 5];
996           s[6] = b[i2 + 6];
997           while (nz--) {
998             idx   = 7 * (*vi++);
999             xw[0] = x[idx];
1000             xw[1] = x[1 + idx];
1001             xw[2] = x[2 + idx];
1002             xw[3] = x[3 + idx];
1003             xw[4] = x[4 + idx];
1004             xw[5] = x[5 + idx];
1005             xw[6] = x[6 + idx];
1006             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1007             v += 49;
1008           }
1009           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1010           x[i2] += xw[0];
1011           x[i2 + 1] += xw[1];
1012           x[i2 + 2] += xw[2];
1013           x[i2 + 3] += xw[3];
1014           x[i2 + 4] += xw[4];
1015           x[i2 + 5] += xw[5];
1016           x[i2 + 6] += xw[6];
1017           idiag += 49;
1018           i2 += 7;
1019         }
1020         break;
1021       default:
1022         for (i = 0; i < m; i++) {
1023           v  = aa + bs2 * ai[i];
1024           vi = aj + ai[i];
1025           nz = ai[i + 1] - ai[i];
1026 
1027           PetscCall(PetscArraycpy(w, b + i2, bs));
1028           /* copy all rows of x that are needed into contiguous space */
1029           workt = work;
1030           for (j = 0; j < nz; j++) {
1031             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1032             workt += bs;
1033           }
1034           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1035           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1036 
1037           idiag += bs2;
1038           i2 += bs;
1039         }
1040         break;
1041       }
1042       PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1043     }
1044     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1045       idiag = a->idiag + bs2 * (a->mbs - 1);
1046       i2    = bs * (m - 1);
1047       switch (bs) {
1048       case 1:
1049         for (i = m - 1; i >= 0; i--) {
1050           v    = aa + ai[i];
1051           vi   = aj + ai[i];
1052           nz   = ai[i + 1] - ai[i];
1053           s[0] = b[i2];
1054           for (j = 0; j < nz; j++) {
1055             xw[0] = x[vi[j]];
1056             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1057           }
1058           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1059           x[i2] += xw[0];
1060           idiag -= 1;
1061           i2 -= 1;
1062         }
1063         break;
1064       case 2:
1065         for (i = m - 1; i >= 0; i--) {
1066           v    = aa + 4 * ai[i];
1067           vi   = aj + ai[i];
1068           nz   = ai[i + 1] - ai[i];
1069           s[0] = b[i2];
1070           s[1] = b[i2 + 1];
1071           for (j = 0; j < nz; j++) {
1072             idx   = 2 * vi[j];
1073             it    = 4 * j;
1074             xw[0] = x[idx];
1075             xw[1] = x[1 + idx];
1076             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1077           }
1078           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1079           x[i2] += xw[0];
1080           x[i2 + 1] += xw[1];
1081           idiag -= 4;
1082           i2 -= 2;
1083         }
1084         break;
1085       case 3:
1086         for (i = m - 1; i >= 0; i--) {
1087           v    = aa + 9 * ai[i];
1088           vi   = aj + ai[i];
1089           nz   = ai[i + 1] - ai[i];
1090           s[0] = b[i2];
1091           s[1] = b[i2 + 1];
1092           s[2] = b[i2 + 2];
1093           while (nz--) {
1094             idx   = 3 * (*vi++);
1095             xw[0] = x[idx];
1096             xw[1] = x[1 + idx];
1097             xw[2] = x[2 + idx];
1098             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1099             v += 9;
1100           }
1101           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1102           x[i2] += xw[0];
1103           x[i2 + 1] += xw[1];
1104           x[i2 + 2] += xw[2];
1105           idiag -= 9;
1106           i2 -= 3;
1107         }
1108         break;
1109       case 4:
1110         for (i = m - 1; i >= 0; i--) {
1111           v    = aa + 16 * ai[i];
1112           vi   = aj + ai[i];
1113           nz   = ai[i + 1] - ai[i];
1114           s[0] = b[i2];
1115           s[1] = b[i2 + 1];
1116           s[2] = b[i2 + 2];
1117           s[3] = b[i2 + 3];
1118           while (nz--) {
1119             idx   = 4 * (*vi++);
1120             xw[0] = x[idx];
1121             xw[1] = x[1 + idx];
1122             xw[2] = x[2 + idx];
1123             xw[3] = x[3 + idx];
1124             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1125             v += 16;
1126           }
1127           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1128           x[i2] += xw[0];
1129           x[i2 + 1] += xw[1];
1130           x[i2 + 2] += xw[2];
1131           x[i2 + 3] += xw[3];
1132           idiag -= 16;
1133           i2 -= 4;
1134         }
1135         break;
1136       case 5:
1137         for (i = m - 1; i >= 0; i--) {
1138           v    = aa + 25 * ai[i];
1139           vi   = aj + ai[i];
1140           nz   = ai[i + 1] - ai[i];
1141           s[0] = b[i2];
1142           s[1] = b[i2 + 1];
1143           s[2] = b[i2 + 2];
1144           s[3] = b[i2 + 3];
1145           s[4] = b[i2 + 4];
1146           while (nz--) {
1147             idx   = 5 * (*vi++);
1148             xw[0] = x[idx];
1149             xw[1] = x[1 + idx];
1150             xw[2] = x[2 + idx];
1151             xw[3] = x[3 + idx];
1152             xw[4] = x[4 + idx];
1153             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1154             v += 25;
1155           }
1156           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1157           x[i2] += xw[0];
1158           x[i2 + 1] += xw[1];
1159           x[i2 + 2] += xw[2];
1160           x[i2 + 3] += xw[3];
1161           x[i2 + 4] += xw[4];
1162           idiag -= 25;
1163           i2 -= 5;
1164         }
1165         break;
1166       case 6:
1167         for (i = m - 1; i >= 0; i--) {
1168           v    = aa + 36 * ai[i];
1169           vi   = aj + ai[i];
1170           nz   = ai[i + 1] - ai[i];
1171           s[0] = b[i2];
1172           s[1] = b[i2 + 1];
1173           s[2] = b[i2 + 2];
1174           s[3] = b[i2 + 3];
1175           s[4] = b[i2 + 4];
1176           s[5] = b[i2 + 5];
1177           while (nz--) {
1178             idx   = 6 * (*vi++);
1179             xw[0] = x[idx];
1180             xw[1] = x[1 + idx];
1181             xw[2] = x[2 + idx];
1182             xw[3] = x[3 + idx];
1183             xw[4] = x[4 + idx];
1184             xw[5] = x[5 + idx];
1185             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1186             v += 36;
1187           }
1188           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1189           x[i2] += xw[0];
1190           x[i2 + 1] += xw[1];
1191           x[i2 + 2] += xw[2];
1192           x[i2 + 3] += xw[3];
1193           x[i2 + 4] += xw[4];
1194           x[i2 + 5] += xw[5];
1195           idiag -= 36;
1196           i2 -= 6;
1197         }
1198         break;
1199       case 7:
1200         for (i = m - 1; i >= 0; i--) {
1201           v    = aa + 49 * ai[i];
1202           vi   = aj + ai[i];
1203           nz   = ai[i + 1] - ai[i];
1204           s[0] = b[i2];
1205           s[1] = b[i2 + 1];
1206           s[2] = b[i2 + 2];
1207           s[3] = b[i2 + 3];
1208           s[4] = b[i2 + 4];
1209           s[5] = b[i2 + 5];
1210           s[6] = b[i2 + 6];
1211           while (nz--) {
1212             idx   = 7 * (*vi++);
1213             xw[0] = x[idx];
1214             xw[1] = x[1 + idx];
1215             xw[2] = x[2 + idx];
1216             xw[3] = x[3 + idx];
1217             xw[4] = x[4 + idx];
1218             xw[5] = x[5 + idx];
1219             xw[6] = x[6 + idx];
1220             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1221             v += 49;
1222           }
1223           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1224           x[i2] += xw[0];
1225           x[i2 + 1] += xw[1];
1226           x[i2 + 2] += xw[2];
1227           x[i2 + 3] += xw[3];
1228           x[i2 + 4] += xw[4];
1229           x[i2 + 5] += xw[5];
1230           x[i2 + 6] += xw[6];
1231           idiag -= 49;
1232           i2 -= 7;
1233         }
1234         break;
1235       default:
1236         for (i = m - 1; i >= 0; i--) {
1237           v  = aa + bs2 * ai[i];
1238           vi = aj + ai[i];
1239           nz = ai[i + 1] - ai[i];
1240 
1241           PetscCall(PetscArraycpy(w, b + i2, bs));
1242           /* copy all rows of x that are needed into contiguous space */
1243           workt = work;
1244           for (j = 0; j < nz; j++) {
1245             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1246             workt += bs;
1247           }
1248           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1249           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1250 
1251           idiag -= bs2;
1252           i2 -= bs;
1253         }
1254         break;
1255       }
1256       PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz)));
1257     }
1258   }
1259   PetscCall(VecRestoreArray(xx, &x));
1260   PetscCall(VecRestoreArrayRead(bb, &b));
1261   PetscFunctionReturn(0);
1262 }
1263 
1264 /*
1265     Special version for direct calls from Fortran (Used in PETSc-fun3d)
1266 */
1267 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1268 #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1269 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1270 #define matsetvaluesblocked4_ matsetvaluesblocked4
1271 #endif
1272 
1273 PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[]) {
1274   Mat                A = *AA;
1275   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1276   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1277   PetscInt          *ai = a->i, *ailen = a->ilen;
1278   PetscInt          *aj = a->j, stepval, lastcol = -1;
1279   const PetscScalar *value = v;
1280   MatScalar         *ap, *aa = a->a, *bap;
1281 
1282   PetscFunctionBegin;
1283   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1284   stepval = (n - 1) * 4;
1285   for (k = 0; k < m; k++) { /* loop over added rows */
1286     row  = im[k];
1287     rp   = aj + ai[row];
1288     ap   = aa + 16 * ai[row];
1289     nrow = ailen[row];
1290     low  = 0;
1291     high = nrow;
1292     for (l = 0; l < n; l++) { /* loop over added columns */
1293       col = in[l];
1294       if (col <= lastcol) low = 0;
1295       else high = nrow;
1296       lastcol = col;
1297       value   = v + k * (stepval + 4 + l) * 4;
1298       while (high - low > 7) {
1299         t = (low + high) / 2;
1300         if (rp[t] > col) high = t;
1301         else low = t;
1302       }
1303       for (i = low; i < high; i++) {
1304         if (rp[i] > col) break;
1305         if (rp[i] == col) {
1306           bap = ap + 16 * i;
1307           for (ii = 0; ii < 4; ii++, value += stepval) {
1308             for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1309           }
1310           goto noinsert2;
1311         }
1312       }
1313       N = nrow++ - 1;
1314       high++; /* added new column index thus must search to one higher than before */
1315       /* shift up all the later entries in this row */
1316       for (ii = N; ii >= i; ii--) {
1317         rp[ii + 1] = rp[ii];
1318         PetscCallVoid(PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16));
1319       }
1320       if (N >= i) PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1321       rp[i] = col;
1322       bap   = ap + 16 * i;
1323       for (ii = 0; ii < 4; ii++, value += stepval) {
1324         for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1325       }
1326     noinsert2:;
1327       low = i;
1328     }
1329     ailen[row] = nrow;
1330   }
1331   PetscFunctionReturnVoid();
1332 }
1333 
1334 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1335 #define matsetvalues4_ MATSETVALUES4
1336 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1337 #define matsetvalues4_ matsetvalues4
1338 #endif
1339 
1340 PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v) {
1341   Mat          A = *AA;
1342   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1343   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1344   PetscInt    *ai = a->i, *ailen = a->ilen;
1345   PetscInt    *aj = a->j, brow, bcol;
1346   PetscInt     ridx, cidx, lastcol = -1;
1347   MatScalar   *ap, value, *aa      = a->a, *bap;
1348 
1349   PetscFunctionBegin;
1350   for (k = 0; k < m; k++) { /* loop over added rows */
1351     row  = im[k];
1352     brow = row / 4;
1353     rp   = aj + ai[brow];
1354     ap   = aa + 16 * ai[brow];
1355     nrow = ailen[brow];
1356     low  = 0;
1357     high = nrow;
1358     for (l = 0; l < n; l++) { /* loop over added columns */
1359       col   = in[l];
1360       bcol  = col / 4;
1361       ridx  = row % 4;
1362       cidx  = col % 4;
1363       value = v[l + k * n];
1364       if (col <= lastcol) low = 0;
1365       else high = nrow;
1366       lastcol = col;
1367       while (high - low > 7) {
1368         t = (low + high) / 2;
1369         if (rp[t] > bcol) high = t;
1370         else low = t;
1371       }
1372       for (i = low; i < high; i++) {
1373         if (rp[i] > bcol) break;
1374         if (rp[i] == bcol) {
1375           bap = ap + 16 * i + 4 * cidx + ridx;
1376           *bap += value;
1377           goto noinsert1;
1378         }
1379       }
1380       N = nrow++ - 1;
1381       high++; /* added new column thus must search to one higher than before */
1382       /* shift up all the later entries in this row */
1383       PetscCallVoid(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
1384       PetscCallVoid(PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1)));
1385       PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1386       rp[i]                        = bcol;
1387       ap[16 * i + 4 * cidx + ridx] = value;
1388     noinsert1:;
1389       low = i;
1390     }
1391     ailen[brow] = nrow;
1392   }
1393   PetscFunctionReturnVoid();
1394 }
1395 
1396 /*
1397      Checks for missing diagonals
1398 */
1399 PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d) {
1400   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1401   PetscInt    *diag, *ii = a->i, i;
1402 
1403   PetscFunctionBegin;
1404   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
1405   *missing = PETSC_FALSE;
1406   if (A->rmap->n > 0 && !ii) {
1407     *missing = PETSC_TRUE;
1408     if (d) *d = 0;
1409     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
1410   } else {
1411     PetscInt n;
1412     n    = PetscMin(a->mbs, a->nbs);
1413     diag = a->diag;
1414     for (i = 0; i < n; i++) {
1415       if (diag[i] >= ii[i + 1]) {
1416         *missing = PETSC_TRUE;
1417         if (d) *d = i;
1418         PetscCall(PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i));
1419         break;
1420       }
1421     }
1422   }
1423   PetscFunctionReturn(0);
1424 }
1425 
1426 PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A) {
1427   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1428   PetscInt     i, j, m = a->mbs;
1429 
1430   PetscFunctionBegin;
1431   if (!a->diag) {
1432     PetscCall(PetscMalloc1(m, &a->diag));
1433     a->free_diag = PETSC_TRUE;
1434   }
1435   for (i = 0; i < m; i++) {
1436     a->diag[i] = a->i[i + 1];
1437     for (j = a->i[i]; j < a->i[i + 1]; j++) {
1438       if (a->j[j] == i) {
1439         a->diag[i] = j;
1440         break;
1441       }
1442     }
1443   }
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done) {
1448   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1449   PetscInt     i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1450   PetscInt   **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;
1451 
1452   PetscFunctionBegin;
1453   *nn = n;
1454   if (!ia) PetscFunctionReturn(0);
1455   if (symmetric) {
1456     PetscCall(MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja));
1457     nz = tia[n];
1458   } else {
1459     tia = a->i;
1460     tja = a->j;
1461   }
1462 
1463   if (!blockcompressed && bs > 1) {
1464     (*nn) *= bs;
1465     /* malloc & create the natural set of indices */
1466     PetscCall(PetscMalloc1((n + 1) * bs, ia));
1467     if (n) {
1468       (*ia)[0] = oshift;
1469       for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1470     }
1471 
1472     for (i = 1; i < n; i++) {
1473       (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1474       for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1475     }
1476     if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];
1477 
1478     if (inja) {
1479       PetscCall(PetscMalloc1(nz * bs * bs, ja));
1480       cnt = 0;
1481       for (i = 0; i < n; i++) {
1482         for (j = 0; j < bs; j++) {
1483           for (k = tia[i]; k < tia[i + 1]; k++) {
1484             for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1485           }
1486         }
1487       }
1488     }
1489 
1490     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1491       PetscCall(PetscFree(tia));
1492       PetscCall(PetscFree(tja));
1493     }
1494   } else if (oshift == 1) {
1495     if (symmetric) {
1496       nz = tia[A->rmap->n / bs];
1497       /*  add 1 to i and j indices */
1498       for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1499       *ia = tia;
1500       if (ja) {
1501         for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1502         *ja = tja;
1503       }
1504     } else {
1505       nz = a->i[A->rmap->n / bs];
1506       /* malloc space and  add 1 to i and j indices */
1507       PetscCall(PetscMalloc1(A->rmap->n / bs + 1, ia));
1508       for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1509       if (ja) {
1510         PetscCall(PetscMalloc1(nz, ja));
1511         for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1512       }
1513     }
1514   } else {
1515     *ia = tia;
1516     if (ja) *ja = tja;
1517   }
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
1522   PetscFunctionBegin;
1523   if (!ia) PetscFunctionReturn(0);
1524   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1525     PetscCall(PetscFree(*ia));
1526     if (ja) PetscCall(PetscFree(*ja));
1527   }
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 PetscErrorCode MatDestroy_SeqBAIJ(Mat A) {
1532   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1533 
1534   PetscFunctionBegin;
1535 #if defined(PETSC_USE_LOG)
1536   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz);
1537 #endif
1538   PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
1539   PetscCall(ISDestroy(&a->row));
1540   PetscCall(ISDestroy(&a->col));
1541   if (a->free_diag) PetscCall(PetscFree(a->diag));
1542   PetscCall(PetscFree(a->idiag));
1543   if (a->free_imax_ilen) PetscCall(PetscFree2(a->imax, a->ilen));
1544   PetscCall(PetscFree(a->solve_work));
1545   PetscCall(PetscFree(a->mult_work));
1546   PetscCall(PetscFree(a->sor_workt));
1547   PetscCall(PetscFree(a->sor_work));
1548   PetscCall(ISDestroy(&a->icol));
1549   PetscCall(PetscFree(a->saved_values));
1550   PetscCall(PetscFree2(a->compressedrow.i, a->compressedrow.rindex));
1551 
1552   PetscCall(MatDestroy(&a->sbaijMat));
1553   PetscCall(MatDestroy(&a->parent));
1554   PetscCall(PetscFree(A->data));
1555 
1556   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
1557   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL));
1558   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL));
1559   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
1560   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
1561   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL));
1562   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL));
1563   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL));
1564   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL));
1565   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL));
1566   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL));
1567   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL));
1568 #if defined(PETSC_HAVE_HYPRE)
1569   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL));
1570 #endif
1571   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL));
1572   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg) {
1577   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1578 
1579   PetscFunctionBegin;
1580   switch (op) {
1581   case MAT_ROW_ORIENTED: a->roworiented = flg; break;
1582   case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
1583   case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
1584   case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
1585   case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
1586   case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
1587   case MAT_FORCE_DIAGONAL_ENTRIES:
1588   case MAT_IGNORE_OFF_PROC_ENTRIES:
1589   case MAT_USE_HASH_TABLE:
1590   case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
1591   case MAT_SPD:
1592   case MAT_SYMMETRIC:
1593   case MAT_STRUCTURALLY_SYMMETRIC:
1594   case MAT_HERMITIAN:
1595   case MAT_SYMMETRY_ETERNAL:
1596   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1597   case MAT_SUBMAT_SINGLEIS:
1598   case MAT_STRUCTURE_ONLY:
1599   case MAT_SPD_ETERNAL:
1600     /* if the diagonal matrix is square it inherits some of the properties above */
1601     break;
1602   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1603   }
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 /* used for both SeqBAIJ and SeqSBAIJ matrices */
1608 PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa) {
1609   PetscInt     itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1610   MatScalar   *aa_i;
1611   PetscScalar *v_i;
1612 
1613   PetscFunctionBegin;
1614   bs  = A->rmap->bs;
1615   bs2 = bs * bs;
1616   PetscCheck(row >= 0 && row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
1617 
1618   bn  = row / bs; /* Block number */
1619   bp  = row % bs; /* Block Position */
1620   M   = ai[bn + 1] - ai[bn];
1621   *nz = bs * M;
1622 
1623   if (v) {
1624     *v = NULL;
1625     if (*nz) {
1626       PetscCall(PetscMalloc1(*nz, v));
1627       for (i = 0; i < M; i++) { /* for each block in the block row */
1628         v_i  = *v + i * bs;
1629         aa_i = aa + bs2 * (ai[bn] + i);
1630         for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1631       }
1632     }
1633   }
1634 
1635   if (idx) {
1636     *idx = NULL;
1637     if (*nz) {
1638       PetscCall(PetscMalloc1(*nz, idx));
1639       for (i = 0; i < M; i++) { /* for each block in the block row */
1640         idx_i = *idx + i * bs;
1641         itmp  = bs * aj[ai[bn] + i];
1642         for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1643       }
1644     }
1645   }
1646   PetscFunctionReturn(0);
1647 }
1648 
1649 PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
1650   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1651 
1652   PetscFunctionBegin;
1653   PetscCall(MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a));
1654   PetscFunctionReturn(0);
1655 }
1656 
1657 PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
1658   PetscFunctionBegin;
1659   if (nz) *nz = 0;
1660   if (idx) PetscCall(PetscFree(*idx));
1661   if (v) PetscCall(PetscFree(*v));
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B) {
1666   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1667   Mat          C;
1668   PetscInt     i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1669   PetscInt     bs2 = a->bs2, *ati, *atj, anzj, kr;
1670   MatScalar   *ata, *aa = a->a;
1671 
1672   PetscFunctionBegin;
1673   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
1674   PetscCall(PetscCalloc1(1 + nbs, &atfill));
1675   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1676     for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */
1677 
1678     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1679     PetscCall(MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N));
1680     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1681     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, atfill));
1682 
1683     at  = (Mat_SeqBAIJ *)C->data;
1684     ati = at->i;
1685     for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1686   } else {
1687     C   = *B;
1688     at  = (Mat_SeqBAIJ *)C->data;
1689     ati = at->i;
1690   }
1691 
1692   atj = at->j;
1693   ata = at->a;
1694 
1695   /* Copy ati into atfill so we have locations of the next free space in atj */
1696   PetscCall(PetscArraycpy(atfill, ati, nbs));
1697 
1698   /* Walk through A row-wise and mark nonzero entries of A^T. */
1699   for (i = 0; i < mbs; i++) {
1700     anzj = ai[i + 1] - ai[i];
1701     for (j = 0; j < anzj; j++) {
1702       atj[atfill[*aj]] = i;
1703       for (kr = 0; kr < bs; kr++) {
1704         for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1705       }
1706       atfill[*aj++] += 1;
1707     }
1708   }
1709   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1710   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1711 
1712   /* Clean up temporary space and complete requests. */
1713   PetscCall(PetscFree(atfill));
1714 
1715   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1716     PetscCall(MatSetBlockSizes(C, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
1717     *B = C;
1718   } else {
1719     PetscCall(MatHeaderMerge(A, &C));
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f) {
1725   Mat Btrans;
1726 
1727   PetscFunctionBegin;
1728   *f = PETSC_FALSE;
1729   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans));
1730   PetscCall(MatEqual_SeqBAIJ(B, Btrans, f));
1731   PetscCall(MatDestroy(&Btrans));
1732   PetscFunctionReturn(0);
1733 }
1734 
1735 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1736 PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer) {
1737   Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1738   PetscInt     header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1739   PetscInt    *rowlens, *colidxs;
1740   PetscScalar *matvals;
1741 
1742   PetscFunctionBegin;
1743   PetscCall(PetscViewerSetUp(viewer));
1744 
1745   M  = mat->rmap->N;
1746   N  = mat->cmap->N;
1747   m  = mat->rmap->n;
1748   bs = mat->rmap->bs;
1749   nz = bs * bs * A->nz;
1750 
1751   /* write matrix header */
1752   header[0] = MAT_FILE_CLASSID;
1753   header[1] = M;
1754   header[2] = N;
1755   header[3] = nz;
1756   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1757 
1758   /* store row lengths */
1759   PetscCall(PetscMalloc1(m, &rowlens));
1760   for (cnt = 0, i = 0; i < A->mbs; i++)
1761     for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1762   PetscCall(PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT));
1763   PetscCall(PetscFree(rowlens));
1764 
1765   /* store column indices  */
1766   PetscCall(PetscMalloc1(nz, &colidxs));
1767   for (cnt = 0, i = 0; i < A->mbs; i++)
1768     for (k = 0; k < bs; k++)
1769       for (j = A->i[i]; j < A->i[i + 1]; j++)
1770         for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1771   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1772   PetscCall(PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT));
1773   PetscCall(PetscFree(colidxs));
1774 
1775   /* store nonzero values */
1776   PetscCall(PetscMalloc1(nz, &matvals));
1777   for (cnt = 0, i = 0; i < A->mbs; i++)
1778     for (k = 0; k < bs; k++)
1779       for (j = A->i[i]; j < A->i[i + 1]; j++)
1780         for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1781   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1782   PetscCall(PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR));
1783   PetscCall(PetscFree(matvals));
1784 
1785   /* write block size option to the viewer's .info file */
1786   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1787   PetscFunctionReturn(0);
1788 }
1789 
1790 static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer) {
1791   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1792   PetscInt     i, bs = A->rmap->bs, k;
1793 
1794   PetscFunctionBegin;
1795   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1796   for (i = 0; i < a->mbs; i++) {
1797     PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1));
1798     for (k = a->i[i]; k < a->i[i + 1]; k++) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1));
1799     PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1800   }
1801   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1802   PetscFunctionReturn(0);
1803 }
1804 
1805 static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer) {
1806   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1807   PetscInt          i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1808   PetscViewerFormat format;
1809 
1810   PetscFunctionBegin;
1811   if (A->structure_only) {
1812     PetscCall(MatView_SeqBAIJ_ASCII_structonly(A, viewer));
1813     PetscFunctionReturn(0);
1814   }
1815 
1816   PetscCall(PetscViewerGetFormat(viewer, &format));
1817   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1818     PetscCall(PetscViewerASCIIPrintf(viewer, "  block size is %" PetscInt_FMT "\n", bs));
1819   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1820     const char *matname;
1821     Mat         aij;
1822     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij));
1823     PetscCall(PetscObjectGetName((PetscObject)A, &matname));
1824     PetscCall(PetscObjectSetName((PetscObject)aij, matname));
1825     PetscCall(MatView(aij, viewer));
1826     PetscCall(MatDestroy(&aij));
1827   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1828     PetscFunctionReturn(0);
1829   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1830     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1831     for (i = 0; i < a->mbs; i++) {
1832       for (j = 0; j < bs; j++) {
1833         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1834         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1835           for (l = 0; l < bs; l++) {
1836 #if defined(PETSC_USE_COMPLEX)
1837             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1838               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1839             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1840               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1841             } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1842               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1843             }
1844 #else
1845             if (a->a[bs2 * k + l * bs + j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1846 #endif
1847           }
1848         }
1849         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1850       }
1851     }
1852     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1853   } else {
1854     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1855     for (i = 0; i < a->mbs; i++) {
1856       for (j = 0; j < bs; j++) {
1857         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1858         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1859           for (l = 0; l < bs; l++) {
1860 #if defined(PETSC_USE_COMPLEX)
1861             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1862               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1863             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1864               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1865             } else {
1866               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1867             }
1868 #else
1869             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1870 #endif
1871           }
1872         }
1873         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1874       }
1875     }
1876     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1877   }
1878   PetscCall(PetscViewerFlush(viewer));
1879   PetscFunctionReturn(0);
1880 }
1881 
1882 #include <petscdraw.h>
1883 static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa) {
1884   Mat               A = (Mat)Aa;
1885   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1886   PetscInt          row, i, j, k, l, mbs = a->mbs, color, bs = A->rmap->bs, bs2 = a->bs2;
1887   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1888   MatScalar        *aa;
1889   PetscViewer       viewer;
1890   PetscViewerFormat format;
1891 
1892   PetscFunctionBegin;
1893   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1894   PetscCall(PetscViewerGetFormat(viewer, &format));
1895   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1896 
1897   /* loop over matrix elements drawing boxes */
1898 
1899   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1900     PetscDrawCollectiveBegin(draw);
1901     /* Blue for negative, Cyan for zero and  Red for positive */
1902     color = PETSC_DRAW_BLUE;
1903     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1904       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1905         y_l = A->rmap->N - row - 1.0;
1906         y_r = y_l + 1.0;
1907         x_l = a->j[j] * bs;
1908         x_r = x_l + 1.0;
1909         aa  = a->a + j * bs2;
1910         for (k = 0; k < bs; k++) {
1911           for (l = 0; l < bs; l++) {
1912             if (PetscRealPart(*aa++) >= 0.) continue;
1913             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1914           }
1915         }
1916       }
1917     }
1918     color = PETSC_DRAW_CYAN;
1919     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1920       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1921         y_l = A->rmap->N - row - 1.0;
1922         y_r = y_l + 1.0;
1923         x_l = a->j[j] * bs;
1924         x_r = x_l + 1.0;
1925         aa  = a->a + j * bs2;
1926         for (k = 0; k < bs; k++) {
1927           for (l = 0; l < bs; l++) {
1928             if (PetscRealPart(*aa++) != 0.) continue;
1929             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1930           }
1931         }
1932       }
1933     }
1934     color = PETSC_DRAW_RED;
1935     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1936       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1937         y_l = A->rmap->N - row - 1.0;
1938         y_r = y_l + 1.0;
1939         x_l = a->j[j] * bs;
1940         x_r = x_l + 1.0;
1941         aa  = a->a + j * bs2;
1942         for (k = 0; k < bs; k++) {
1943           for (l = 0; l < bs; l++) {
1944             if (PetscRealPart(*aa++) <= 0.) continue;
1945             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1946           }
1947         }
1948       }
1949     }
1950     PetscDrawCollectiveEnd(draw);
1951   } else {
1952     /* use contour shading to indicate magnitude of values */
1953     /* first determine max of all nonzero values */
1954     PetscReal minv = 0.0, maxv = 0.0;
1955     PetscDraw popup;
1956 
1957     for (i = 0; i < a->nz * a->bs2; i++) {
1958       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1959     }
1960     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1961     PetscCall(PetscDrawGetPopup(draw, &popup));
1962     PetscCall(PetscDrawScalePopup(popup, 0.0, maxv));
1963 
1964     PetscDrawCollectiveBegin(draw);
1965     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1966       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1967         y_l = A->rmap->N - row - 1.0;
1968         y_r = y_l + 1.0;
1969         x_l = a->j[j] * bs;
1970         x_r = x_l + 1.0;
1971         aa  = a->a + j * bs2;
1972         for (k = 0; k < bs; k++) {
1973           for (l = 0; l < bs; l++) {
1974             MatScalar v = *aa++;
1975             color       = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
1976             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1977           }
1978         }
1979       }
1980     }
1981     PetscDrawCollectiveEnd(draw);
1982   }
1983   PetscFunctionReturn(0);
1984 }
1985 
1986 static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer) {
1987   PetscReal xl, yl, xr, yr, w, h;
1988   PetscDraw draw;
1989   PetscBool isnull;
1990 
1991   PetscFunctionBegin;
1992   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1993   PetscCall(PetscDrawIsNull(draw, &isnull));
1994   if (isnull) PetscFunctionReturn(0);
1995 
1996   xr = A->cmap->n;
1997   yr = A->rmap->N;
1998   h  = yr / 10.0;
1999   w  = xr / 10.0;
2000   xr += w;
2001   yr += h;
2002   xl = -w;
2003   yl = -h;
2004   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
2005   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
2006   PetscCall(PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A));
2007   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
2008   PetscCall(PetscDrawSave(draw));
2009   PetscFunctionReturn(0);
2010 }
2011 
2012 PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer) {
2013   PetscBool iascii, isbinary, isdraw;
2014 
2015   PetscFunctionBegin;
2016   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
2017   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
2018   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
2019   if (iascii) {
2020     PetscCall(MatView_SeqBAIJ_ASCII(A, viewer));
2021   } else if (isbinary) {
2022     PetscCall(MatView_SeqBAIJ_Binary(A, viewer));
2023   } else if (isdraw) {
2024     PetscCall(MatView_SeqBAIJ_Draw(A, viewer));
2025   } else {
2026     Mat B;
2027     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
2028     PetscCall(MatView(B, viewer));
2029     PetscCall(MatDestroy(&B));
2030   }
2031   PetscFunctionReturn(0);
2032 }
2033 
2034 PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
2035   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2036   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2037   PetscInt    *ai = a->i, *ailen = a->ilen;
2038   PetscInt     brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2039   MatScalar   *ap, *aa = a->a;
2040 
2041   PetscFunctionBegin;
2042   for (k = 0; k < m; k++) { /* loop over rows */
2043     row  = im[k];
2044     brow = row / bs;
2045     if (row < 0) {
2046       v += n;
2047       continue;
2048     } /* negative row */
2049     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " too large", row);
2050     rp   = aj ? aj + ai[brow] : NULL;       /* mustn't add to NULL, that is UB */
2051     ap   = aa ? aa + bs2 * ai[brow] : NULL; /* mustn't add to NULL, that is UB */
2052     nrow = ailen[brow];
2053     for (l = 0; l < n; l++) { /* loop over columns */
2054       if (in[l] < 0) {
2055         v++;
2056         continue;
2057       } /* negative column */
2058       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column %" PetscInt_FMT " too large", in[l]);
2059       col  = in[l];
2060       bcol = col / bs;
2061       cidx = col % bs;
2062       ridx = row % bs;
2063       high = nrow;
2064       low  = 0; /* assume unsorted */
2065       while (high - low > 5) {
2066         t = (low + high) / 2;
2067         if (rp[t] > bcol) high = t;
2068         else low = t;
2069       }
2070       for (i = low; i < high; i++) {
2071         if (rp[i] > bcol) break;
2072         if (rp[i] == bcol) {
2073           *v++ = ap[bs2 * i + bs * cidx + ridx];
2074           goto finished;
2075         }
2076       }
2077       *v++ = 0.0;
2078     finished:;
2079     }
2080   }
2081   PetscFunctionReturn(0);
2082 }
2083 
2084 PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
2085   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2086   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2087   PetscInt          *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2088   PetscInt          *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2089   PetscBool          roworiented = a->roworiented;
2090   const PetscScalar *value       = v;
2091   MatScalar         *ap = NULL, *aa = a->a, *bap;
2092 
2093   PetscFunctionBegin;
2094   if (roworiented) {
2095     stepval = (n - 1) * bs;
2096   } else {
2097     stepval = (m - 1) * bs;
2098   }
2099   for (k = 0; k < m; k++) { /* loop over added rows */
2100     row = im[k];
2101     if (row < 0) continue;
2102     PetscCheck(row < a->mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block row index too large %" PetscInt_FMT " max %" PetscInt_FMT, row, a->mbs - 1);
2103     rp = aj + ai[row];
2104     if (!A->structure_only) ap = aa + bs2 * ai[row];
2105     rmax = imax[row];
2106     nrow = ailen[row];
2107     low  = 0;
2108     high = nrow;
2109     for (l = 0; l < n; l++) { /* loop over added columns */
2110       if (in[l] < 0) continue;
2111       PetscCheck(in[l] < a->nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block column index too large %" PetscInt_FMT " max %" PetscInt_FMT, in[l], a->nbs - 1);
2112       col = in[l];
2113       if (!A->structure_only) {
2114         if (roworiented) {
2115           value = v + (k * (stepval + bs) + l) * bs;
2116         } else {
2117           value = v + (l * (stepval + bs) + k) * bs;
2118         }
2119       }
2120       if (col <= lastcol) low = 0;
2121       else high = nrow;
2122       lastcol = col;
2123       while (high - low > 7) {
2124         t = (low + high) / 2;
2125         if (rp[t] > col) high = t;
2126         else low = t;
2127       }
2128       for (i = low; i < high; i++) {
2129         if (rp[i] > col) break;
2130         if (rp[i] == col) {
2131           if (A->structure_only) goto noinsert2;
2132           bap = ap + bs2 * i;
2133           if (roworiented) {
2134             if (is == ADD_VALUES) {
2135               for (ii = 0; ii < bs; ii++, value += stepval) {
2136                 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2137               }
2138             } else {
2139               for (ii = 0; ii < bs; ii++, value += stepval) {
2140                 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2141               }
2142             }
2143           } else {
2144             if (is == ADD_VALUES) {
2145               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2146                 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2147                 bap += bs;
2148               }
2149             } else {
2150               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2151                 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2152                 bap += bs;
2153               }
2154             }
2155           }
2156           goto noinsert2;
2157         }
2158       }
2159       if (nonew == 1) goto noinsert2;
2160       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new blocked index new nonzero block (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2161       if (A->structure_only) {
2162         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2163       } else {
2164         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2165       }
2166       N = nrow++ - 1;
2167       high++;
2168       /* shift up all the later entries in this row */
2169       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2170       rp[i] = col;
2171       if (!A->structure_only) {
2172         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2173         bap = ap + bs2 * i;
2174         if (roworiented) {
2175           for (ii = 0; ii < bs; ii++, value += stepval) {
2176             for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2177           }
2178         } else {
2179           for (ii = 0; ii < bs; ii++, value += stepval) {
2180             for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2181           }
2182         }
2183       }
2184     noinsert2:;
2185       low = i;
2186     }
2187     ailen[row] = nrow;
2188   }
2189   PetscFunctionReturn(0);
2190 }
2191 
2192 PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode) {
2193   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2194   PetscInt     fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2195   PetscInt     m = A->rmap->N, *ip, N, *ailen = a->ilen;
2196   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2197   MatScalar   *aa    = a->a, *ap;
2198   PetscReal    ratio = 0.6;
2199 
2200   PetscFunctionBegin;
2201   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
2202 
2203   if (m) rmax = ailen[0];
2204   for (i = 1; i < mbs; i++) {
2205     /* move each row back by the amount of empty slots (fshift) before it*/
2206     fshift += imax[i - 1] - ailen[i - 1];
2207     rmax = PetscMax(rmax, ailen[i]);
2208     if (fshift) {
2209       ip = aj + ai[i];
2210       ap = aa + bs2 * ai[i];
2211       N  = ailen[i];
2212       PetscCall(PetscArraymove(ip - fshift, ip, N));
2213       if (!A->structure_only) PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2 * N));
2214     }
2215     ai[i] = ai[i - 1] + ailen[i - 1];
2216   }
2217   if (mbs) {
2218     fshift += imax[mbs - 1] - ailen[mbs - 1];
2219     ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2220   }
2221 
2222   /* reset ilen and imax for each row */
2223   a->nonzerorowcnt = 0;
2224   if (A->structure_only) {
2225     PetscCall(PetscFree2(a->imax, a->ilen));
2226   } else { /* !A->structure_only */
2227     for (i = 0; i < mbs; i++) {
2228       ailen[i] = imax[i] = ai[i + 1] - ai[i];
2229       a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2230     }
2231   }
2232   a->nz = ai[mbs];
2233 
2234   /* diagonals may have moved, so kill the diagonal pointers */
2235   a->idiagvalid = PETSC_FALSE;
2236   if (fshift && a->diag) {
2237     PetscCall(PetscFree(a->diag));
2238     a->diag = NULL;
2239   }
2240   if (fshift) PetscCheck(a->nounused != -1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unused space detected in matrix: %" PetscInt_FMT " X %" PetscInt_FMT " block size %" PetscInt_FMT ", %" PetscInt_FMT " unneeded", m, A->cmap->n, A->rmap->bs, fshift * bs2);
2241   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2));
2242   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs));
2243   PetscCall(PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax));
2244 
2245   A->info.mallocs += a->reallocs;
2246   a->reallocs         = 0;
2247   A->info.nz_unneeded = (PetscReal)fshift * bs2;
2248   a->rmax             = rmax;
2249 
2250   if (!A->structure_only) PetscCall(MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio));
2251   PetscFunctionReturn(0);
2252 }
2253 
2254 /*
2255    This function returns an array of flags which indicate the locations of contiguous
2256    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2257    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2258    Assume: sizes should be long enough to hold all the values.
2259 */
2260 static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max) {
2261   PetscInt  i, j, k, row;
2262   PetscBool flg;
2263 
2264   PetscFunctionBegin;
2265   for (i = 0, j = 0; i < n; j++) {
2266     row = idx[i];
2267     if (row % bs != 0) { /* Not the beginning of a block */
2268       sizes[j] = 1;
2269       i++;
2270     } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2271       sizes[j] = 1;          /* Also makes sure at least 'bs' values exist for next else */
2272       i++;
2273     } else { /* Beginning of the block, so check if the complete block exists */
2274       flg = PETSC_TRUE;
2275       for (k = 1; k < bs; k++) {
2276         if (row + k != idx[i + k]) { /* break in the block */
2277           flg = PETSC_FALSE;
2278           break;
2279         }
2280       }
2281       if (flg) { /* No break in the bs */
2282         sizes[j] = bs;
2283         i += bs;
2284       } else {
2285         sizes[j] = 1;
2286         i++;
2287       }
2288     }
2289   }
2290   *bs_max = j;
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b) {
2295   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2296   PetscInt           i, j, k, count, *rows;
2297   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2298   PetscScalar        zero = 0.0;
2299   MatScalar         *aa;
2300   const PetscScalar *xx;
2301   PetscScalar       *bb;
2302 
2303   PetscFunctionBegin;
2304   /* fix right hand side if needed */
2305   if (x && b) {
2306     PetscCall(VecGetArrayRead(x, &xx));
2307     PetscCall(VecGetArray(b, &bb));
2308     for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2309     PetscCall(VecRestoreArrayRead(x, &xx));
2310     PetscCall(VecRestoreArray(b, &bb));
2311   }
2312 
2313   /* Make a copy of the IS and  sort it */
2314   /* allocate memory for rows,sizes */
2315   PetscCall(PetscMalloc2(is_n, &rows, 2 * is_n, &sizes));
2316 
2317   /* copy IS values to rows, and sort them */
2318   for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2319   PetscCall(PetscSortInt(is_n, rows));
2320 
2321   if (baij->keepnonzeropattern) {
2322     for (i = 0; i < is_n; i++) sizes[i] = 1;
2323     bs_max = is_n;
2324   } else {
2325     PetscCall(MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max));
2326     A->nonzerostate++;
2327   }
2328 
2329   for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2330     row = rows[j];
2331     PetscCheck(row >= 0 && row <= A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", row);
2332     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2333     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2334     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2335       if (diag != (PetscScalar)0.0) {
2336         if (baij->ilen[row / bs] > 0) {
2337           baij->ilen[row / bs]       = 1;
2338           baij->j[baij->i[row / bs]] = row / bs;
2339 
2340           PetscCall(PetscArrayzero(aa, count * bs));
2341         }
2342         /* Now insert all the diagonal values for this bs */
2343         for (k = 0; k < bs; k++) PetscCall((*A->ops->setvalues)(A, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES));
2344       } else { /* (diag == 0.0) */
2345         baij->ilen[row / bs] = 0;
2346       }      /* end (diag == 0.0) */
2347     } else { /* (sizes[i] != bs) */
2348       PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2349       for (k = 0; k < count; k++) {
2350         aa[0] = zero;
2351         aa += bs;
2352       }
2353       if (diag != (PetscScalar)0.0) PetscCall((*A->ops->setvalues)(A, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES));
2354     }
2355   }
2356 
2357   PetscCall(PetscFree2(rows, sizes));
2358   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2359   PetscFunctionReturn(0);
2360 }
2361 
2362 PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b) {
2363   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2364   PetscInt           i, j, k, count;
2365   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2366   PetscScalar        zero = 0.0;
2367   MatScalar         *aa;
2368   const PetscScalar *xx;
2369   PetscScalar       *bb;
2370   PetscBool         *zeroed, vecs = PETSC_FALSE;
2371 
2372   PetscFunctionBegin;
2373   /* fix right hand side if needed */
2374   if (x && b) {
2375     PetscCall(VecGetArrayRead(x, &xx));
2376     PetscCall(VecGetArray(b, &bb));
2377     vecs = PETSC_TRUE;
2378   }
2379 
2380   /* zero the columns */
2381   PetscCall(PetscCalloc1(A->rmap->n, &zeroed));
2382   for (i = 0; i < is_n; i++) {
2383     PetscCheck(is_idx[i] >= 0 && is_idx[i] < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", is_idx[i]);
2384     zeroed[is_idx[i]] = PETSC_TRUE;
2385   }
2386   for (i = 0; i < A->rmap->N; i++) {
2387     if (!zeroed[i]) {
2388       row = i / bs;
2389       for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2390         for (k = 0; k < bs; k++) {
2391           col = bs * baij->j[j] + k;
2392           if (zeroed[col]) {
2393             aa = ((MatScalar *)(baij->a)) + j * bs2 + (i % bs) + bs * k;
2394             if (vecs) bb[i] -= aa[0] * xx[col];
2395             aa[0] = 0.0;
2396           }
2397         }
2398       }
2399     } else if (vecs) bb[i] = diag * xx[i];
2400   }
2401   PetscCall(PetscFree(zeroed));
2402   if (vecs) {
2403     PetscCall(VecRestoreArrayRead(x, &xx));
2404     PetscCall(VecRestoreArray(b, &bb));
2405   }
2406 
2407   /* zero the rows */
2408   for (i = 0; i < is_n; i++) {
2409     row   = is_idx[i];
2410     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2411     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2412     for (k = 0; k < count; k++) {
2413       aa[0] = zero;
2414       aa += bs;
2415     }
2416     if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2417   }
2418   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2419   PetscFunctionReturn(0);
2420 }
2421 
2422 PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
2423   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2424   PetscInt    *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2425   PetscInt    *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2426   PetscInt    *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2427   PetscInt     ridx, cidx, bs2                 = a->bs2;
2428   PetscBool    roworiented = a->roworiented;
2429   MatScalar   *ap = NULL, value = 0.0, *aa = a->a, *bap;
2430 
2431   PetscFunctionBegin;
2432   for (k = 0; k < m; k++) { /* loop over added rows */
2433     row  = im[k];
2434     brow = row / bs;
2435     if (row < 0) continue;
2436     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->N - 1);
2437     rp = aj + ai[brow];
2438     if (!A->structure_only) ap = aa + bs2 * ai[brow];
2439     rmax = imax[brow];
2440     nrow = ailen[brow];
2441     low  = 0;
2442     high = nrow;
2443     for (l = 0; l < n; l++) { /* loop over added columns */
2444       if (in[l] < 0) continue;
2445       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[l], A->cmap->n - 1);
2446       col  = in[l];
2447       bcol = col / bs;
2448       ridx = row % bs;
2449       cidx = col % bs;
2450       if (!A->structure_only) {
2451         if (roworiented) {
2452           value = v[l + k * n];
2453         } else {
2454           value = v[k + l * m];
2455         }
2456       }
2457       if (col <= lastcol) low = 0;
2458       else high = nrow;
2459       lastcol = col;
2460       while (high - low > 7) {
2461         t = (low + high) / 2;
2462         if (rp[t] > bcol) high = t;
2463         else low = t;
2464       }
2465       for (i = low; i < high; i++) {
2466         if (rp[i] > bcol) break;
2467         if (rp[i] == bcol) {
2468           bap = ap + bs2 * i + bs * cidx + ridx;
2469           if (!A->structure_only) {
2470             if (is == ADD_VALUES) *bap += value;
2471             else *bap = value;
2472           }
2473           goto noinsert1;
2474         }
2475       }
2476       if (nonew == 1) goto noinsert1;
2477       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2478       if (A->structure_only) {
2479         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2480       } else {
2481         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2482       }
2483       N = nrow++ - 1;
2484       high++;
2485       /* shift up all the later entries in this row */
2486       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2487       rp[i] = bcol;
2488       if (!A->structure_only) {
2489         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2490         PetscCall(PetscArrayzero(ap + bs2 * i, bs2));
2491         ap[bs2 * i + bs * cidx + ridx] = value;
2492       }
2493       a->nz++;
2494       A->nonzerostate++;
2495     noinsert1:;
2496       low = i;
2497     }
2498     ailen[brow] = nrow;
2499   }
2500   PetscFunctionReturn(0);
2501 }
2502 
2503 PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info) {
2504   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2505   Mat          outA;
2506   PetscBool    row_identity, col_identity;
2507 
2508   PetscFunctionBegin;
2509   PetscCheck(info->levels == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only levels = 0 supported for in-place ILU");
2510   PetscCall(ISIdentity(row, &row_identity));
2511   PetscCall(ISIdentity(col, &col_identity));
2512   PetscCheck(row_identity && col_identity, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column permutations must be identity for in-place ILU");
2513 
2514   outA            = inA;
2515   inA->factortype = MAT_FACTOR_LU;
2516   PetscCall(PetscFree(inA->solvertype));
2517   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype));
2518 
2519   PetscCall(MatMarkDiagonal_SeqBAIJ(inA));
2520 
2521   PetscCall(PetscObjectReference((PetscObject)row));
2522   PetscCall(ISDestroy(&a->row));
2523   a->row = row;
2524   PetscCall(PetscObjectReference((PetscObject)col));
2525   PetscCall(ISDestroy(&a->col));
2526   a->col = col;
2527 
2528   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2529   PetscCall(ISDestroy(&a->icol));
2530   PetscCall(ISInvertPermutation(col, PETSC_DECIDE, &a->icol));
2531 
2532   PetscCall(MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity)));
2533   if (!a->solve_work) { PetscCall(PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work)); }
2534   PetscCall(MatLUFactorNumeric(outA, inA, info));
2535   PetscFunctionReturn(0);
2536 }
2537 
2538 PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, PetscInt *indices) {
2539   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;
2540   PetscInt     i, nz, mbs;
2541 
2542   PetscFunctionBegin;
2543   nz  = baij->maxnz;
2544   mbs = baij->mbs;
2545   for (i = 0; i < nz; i++) baij->j[i] = indices[i];
2546   baij->nz = nz;
2547   for (i = 0; i < mbs; i++) baij->ilen[i] = baij->imax[i];
2548   PetscFunctionReturn(0);
2549 }
2550 
2551 /*@
2552     MatSeqBAIJSetColumnIndices - Set the column indices for all the rows in the matrix.
2553 
2554   Input Parameters:
2555 +  mat - the `MATSEQBAIJ` matrix
2556 -  indices - the column indices
2557 
2558   Level: advanced
2559 
2560   Notes:
2561     This can be called if you have precomputed the nonzero structure of the
2562   matrix and want to provide it to the matrix object to improve the performance
2563   of the `MatSetValues()` operation.
2564 
2565     You MUST have set the correct numbers of nonzeros per row in the call to
2566   `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.
2567 
2568     MUST be called before any calls to `MatSetValues()`
2569 
2570 .seealso: `MATSEQBAIJ`, `MatSetValues()`
2571 @*/
2572 PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices) {
2573   PetscFunctionBegin;
2574   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
2575   PetscValidIntPointer(indices, 2);
2576   PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, PetscInt *), (mat, indices));
2577   PetscFunctionReturn(0);
2578 }
2579 
2580 PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[]) {
2581   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2582   PetscInt     i, j, n, row, bs, *ai, *aj, mbs;
2583   PetscReal    atmp;
2584   PetscScalar *x, zero = 0.0;
2585   MatScalar   *aa;
2586   PetscInt     ncols, brow, krow, kcol;
2587 
2588   PetscFunctionBegin;
2589   /* why is this not a macro???????????????????????????????????????????????????????????????? */
2590   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2591   bs  = A->rmap->bs;
2592   aa  = a->a;
2593   ai  = a->i;
2594   aj  = a->j;
2595   mbs = a->mbs;
2596 
2597   PetscCall(VecSet(v, zero));
2598   PetscCall(VecGetArray(v, &x));
2599   PetscCall(VecGetLocalSize(v, &n));
2600   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2601   for (i = 0; i < mbs; i++) {
2602     ncols = ai[1] - ai[0];
2603     ai++;
2604     brow = bs * i;
2605     for (j = 0; j < ncols; j++) {
2606       for (kcol = 0; kcol < bs; kcol++) {
2607         for (krow = 0; krow < bs; krow++) {
2608           atmp = PetscAbsScalar(*aa);
2609           aa++;
2610           row = brow + krow; /* row index */
2611           if (PetscAbsScalar(x[row]) < atmp) {
2612             x[row] = atmp;
2613             if (idx) idx[row] = bs * (*aj) + kcol;
2614           }
2615         }
2616       }
2617       aj++;
2618     }
2619   }
2620   PetscCall(VecRestoreArray(v, &x));
2621   PetscFunctionReturn(0);
2622 }
2623 
2624 PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str) {
2625   PetscFunctionBegin;
2626   /* If the two matrices have the same copy implementation, use fast copy. */
2627   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2628     Mat_SeqBAIJ *a    = (Mat_SeqBAIJ *)A->data;
2629     Mat_SeqBAIJ *b    = (Mat_SeqBAIJ *)B->data;
2630     PetscInt     ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;
2631 
2632     PetscCheck(a->i[ambs] == b->i[bmbs], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzero blocks in matrices A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", a->i[ambs], b->i[bmbs]);
2633     PetscCheck(abs == bbs, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Block size A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", abs, bbs);
2634     PetscCall(PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]));
2635     PetscCall(PetscObjectStateIncrease((PetscObject)B));
2636   } else {
2637     PetscCall(MatCopy_Basic(A, B, str));
2638   }
2639   PetscFunctionReturn(0);
2640 }
2641 
2642 PetscErrorCode MatSetUp_SeqBAIJ(Mat A) {
2643   PetscFunctionBegin;
2644   PetscCall(MatSeqBAIJSetPreallocation(A, A->rmap->bs, PETSC_DEFAULT, NULL));
2645   PetscFunctionReturn(0);
2646 }
2647 
2648 static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[]) {
2649   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2650 
2651   PetscFunctionBegin;
2652   *array = a->a;
2653   PetscFunctionReturn(0);
2654 }
2655 
2656 static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[]) {
2657   PetscFunctionBegin;
2658   *array = NULL;
2659   PetscFunctionReturn(0);
2660 }
2661 
2662 PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz) {
2663   PetscInt     bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2664   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2665   Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;
2666 
2667   PetscFunctionBegin;
2668   /* Set the number of nonzeros in the new matrix */
2669   PetscCall(MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz));
2670   PetscFunctionReturn(0);
2671 }
2672 
2673 PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str) {
2674   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2675   PetscInt     bs = Y->rmap->bs, bs2 = bs * bs;
2676   PetscBLASInt one = 1;
2677 
2678   PetscFunctionBegin;
2679   if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2680     PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2681     if (e) {
2682       PetscCall(PetscArraycmp(x->i, y->i, x->mbs + 1, &e));
2683       if (e) {
2684         PetscCall(PetscArraycmp(x->j, y->j, x->i[x->mbs], &e));
2685         if (e) str = SAME_NONZERO_PATTERN;
2686       }
2687     }
2688     if (!e) PetscCheck(str != SAME_NONZERO_PATTERN, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "MatStructure is not SAME_NONZERO_PATTERN");
2689   }
2690   if (str == SAME_NONZERO_PATTERN) {
2691     PetscScalar  alpha = a;
2692     PetscBLASInt bnz;
2693     PetscCall(PetscBLASIntCast(x->nz * bs2, &bnz));
2694     PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2695     PetscCall(PetscObjectStateIncrease((PetscObject)Y));
2696   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2697     PetscCall(MatAXPY_Basic(Y, a, X, str));
2698   } else {
2699     Mat       B;
2700     PetscInt *nnz;
2701     PetscCheck(bs == X->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrices must have same block size");
2702     PetscCall(PetscMalloc1(Y->rmap->N, &nnz));
2703     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2704     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2705     PetscCall(MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N));
2706     PetscCall(MatSetBlockSizesFromMats(B, Y, Y));
2707     PetscCall(MatSetType(B, (MatType)((PetscObject)Y)->type_name));
2708     PetscCall(MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz));
2709     PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
2710     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2711     PetscCall(MatHeaderMerge(Y, &B));
2712     PetscCall(PetscFree(nnz));
2713   }
2714   PetscFunctionReturn(0);
2715 }
2716 
2717 PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A) {
2718 #if defined(PETSC_USE_COMPLEX)
2719   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2720   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2721   MatScalar   *aa = a->a;
2722 
2723   PetscFunctionBegin;
2724   for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2725 #else
2726   PetscFunctionBegin;
2727 #endif
2728   PetscFunctionReturn(0);
2729 }
2730 
2731 PetscErrorCode MatRealPart_SeqBAIJ(Mat A) {
2732   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2733   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2734   MatScalar   *aa = a->a;
2735 
2736   PetscFunctionBegin;
2737   for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2738   PetscFunctionReturn(0);
2739 }
2740 
2741 PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A) {
2742   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2743   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2744   MatScalar   *aa = a->a;
2745 
2746   PetscFunctionBegin;
2747   for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2748   PetscFunctionReturn(0);
2749 }
2750 
2751 /*
2752     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2753 */
2754 PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
2755   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2756   PetscInt     bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2757   PetscInt     nz = a->i[m], row, *jj, mr, col;
2758 
2759   PetscFunctionBegin;
2760   *nn = n;
2761   if (!ia) PetscFunctionReturn(0);
2762   PetscCheck(!symmetric, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for BAIJ matrices");
2763   PetscCall(PetscCalloc1(n, &collengths));
2764   PetscCall(PetscMalloc1(n + 1, &cia));
2765   PetscCall(PetscMalloc1(nz, &cja));
2766   jj = a->j;
2767   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2768   cia[0] = oshift;
2769   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2770   PetscCall(PetscArrayzero(collengths, n));
2771   jj = a->j;
2772   for (row = 0; row < m; row++) {
2773     mr = a->i[row + 1] - a->i[row];
2774     for (i = 0; i < mr; i++) {
2775       col = *jj++;
2776 
2777       cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2778     }
2779   }
2780   PetscCall(PetscFree(collengths));
2781   *ia = cia;
2782   *ja = cja;
2783   PetscFunctionReturn(0);
2784 }
2785 
2786 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
2787   PetscFunctionBegin;
2788   if (!ia) PetscFunctionReturn(0);
2789   PetscCall(PetscFree(*ia));
2790   PetscCall(PetscFree(*ja));
2791   PetscFunctionReturn(0);
2792 }
2793 
2794 /*
2795  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2796  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2797  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2798  */
2799 PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done) {
2800   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2801   PetscInt     i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2802   PetscInt     nz = a->i[m], row, *jj, mr, col;
2803   PetscInt    *cspidx;
2804 
2805   PetscFunctionBegin;
2806   *nn = n;
2807   if (!ia) PetscFunctionReturn(0);
2808 
2809   PetscCall(PetscCalloc1(n, &collengths));
2810   PetscCall(PetscMalloc1(n + 1, &cia));
2811   PetscCall(PetscMalloc1(nz, &cja));
2812   PetscCall(PetscMalloc1(nz, &cspidx));
2813   jj = a->j;
2814   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2815   cia[0] = oshift;
2816   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2817   PetscCall(PetscArrayzero(collengths, n));
2818   jj = a->j;
2819   for (row = 0; row < m; row++) {
2820     mr = a->i[row + 1] - a->i[row];
2821     for (i = 0; i < mr; i++) {
2822       col                                         = *jj++;
2823       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2824       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2825     }
2826   }
2827   PetscCall(PetscFree(collengths));
2828   *ia    = cia;
2829   *ja    = cja;
2830   *spidx = cspidx;
2831   PetscFunctionReturn(0);
2832 }
2833 
2834 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done) {
2835   PetscFunctionBegin;
2836   PetscCall(MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done));
2837   PetscCall(PetscFree(*spidx));
2838   PetscFunctionReturn(0);
2839 }
2840 
2841 PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a) {
2842   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;
2843 
2844   PetscFunctionBegin;
2845   if (!Y->preallocated || !aij->nz) PetscCall(MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL));
2846   PetscCall(MatShift_Basic(Y, a));
2847   PetscFunctionReturn(0);
2848 }
2849 
2850 /* -------------------------------------------------------------------*/
2851 static struct _MatOps MatOps_Values = {
2852   MatSetValues_SeqBAIJ,
2853   MatGetRow_SeqBAIJ,
2854   MatRestoreRow_SeqBAIJ,
2855   MatMult_SeqBAIJ_N,
2856   /* 4*/ MatMultAdd_SeqBAIJ_N,
2857   MatMultTranspose_SeqBAIJ,
2858   MatMultTransposeAdd_SeqBAIJ,
2859   NULL,
2860   NULL,
2861   NULL,
2862   /* 10*/ NULL,
2863   MatLUFactor_SeqBAIJ,
2864   NULL,
2865   NULL,
2866   MatTranspose_SeqBAIJ,
2867   /* 15*/ MatGetInfo_SeqBAIJ,
2868   MatEqual_SeqBAIJ,
2869   MatGetDiagonal_SeqBAIJ,
2870   MatDiagonalScale_SeqBAIJ,
2871   MatNorm_SeqBAIJ,
2872   /* 20*/ NULL,
2873   MatAssemblyEnd_SeqBAIJ,
2874   MatSetOption_SeqBAIJ,
2875   MatZeroEntries_SeqBAIJ,
2876   /* 24*/ MatZeroRows_SeqBAIJ,
2877   NULL,
2878   NULL,
2879   NULL,
2880   NULL,
2881   /* 29*/ MatSetUp_SeqBAIJ,
2882   NULL,
2883   NULL,
2884   NULL,
2885   NULL,
2886   /* 34*/ MatDuplicate_SeqBAIJ,
2887   NULL,
2888   NULL,
2889   MatILUFactor_SeqBAIJ,
2890   NULL,
2891   /* 39*/ MatAXPY_SeqBAIJ,
2892   MatCreateSubMatrices_SeqBAIJ,
2893   MatIncreaseOverlap_SeqBAIJ,
2894   MatGetValues_SeqBAIJ,
2895   MatCopy_SeqBAIJ,
2896   /* 44*/ NULL,
2897   MatScale_SeqBAIJ,
2898   MatShift_SeqBAIJ,
2899   NULL,
2900   MatZeroRowsColumns_SeqBAIJ,
2901   /* 49*/ NULL,
2902   MatGetRowIJ_SeqBAIJ,
2903   MatRestoreRowIJ_SeqBAIJ,
2904   MatGetColumnIJ_SeqBAIJ,
2905   MatRestoreColumnIJ_SeqBAIJ,
2906   /* 54*/ MatFDColoringCreate_SeqXAIJ,
2907   NULL,
2908   NULL,
2909   NULL,
2910   MatSetValuesBlocked_SeqBAIJ,
2911   /* 59*/ MatCreateSubMatrix_SeqBAIJ,
2912   MatDestroy_SeqBAIJ,
2913   MatView_SeqBAIJ,
2914   NULL,
2915   NULL,
2916   /* 64*/ NULL,
2917   NULL,
2918   NULL,
2919   NULL,
2920   NULL,
2921   /* 69*/ MatGetRowMaxAbs_SeqBAIJ,
2922   NULL,
2923   MatConvert_Basic,
2924   NULL,
2925   NULL,
2926   /* 74*/ NULL,
2927   MatFDColoringApply_BAIJ,
2928   NULL,
2929   NULL,
2930   NULL,
2931   /* 79*/ NULL,
2932   NULL,
2933   NULL,
2934   NULL,
2935   MatLoad_SeqBAIJ,
2936   /* 84*/ NULL,
2937   NULL,
2938   NULL,
2939   NULL,
2940   NULL,
2941   /* 89*/ NULL,
2942   NULL,
2943   NULL,
2944   NULL,
2945   NULL,
2946   /* 94*/ NULL,
2947   NULL,
2948   NULL,
2949   NULL,
2950   NULL,
2951   /* 99*/ NULL,
2952   NULL,
2953   NULL,
2954   MatConjugate_SeqBAIJ,
2955   NULL,
2956   /*104*/ NULL,
2957   MatRealPart_SeqBAIJ,
2958   MatImaginaryPart_SeqBAIJ,
2959   NULL,
2960   NULL,
2961   /*109*/ NULL,
2962   NULL,
2963   NULL,
2964   NULL,
2965   MatMissingDiagonal_SeqBAIJ,
2966   /*114*/ NULL,
2967   NULL,
2968   NULL,
2969   NULL,
2970   NULL,
2971   /*119*/ NULL,
2972   NULL,
2973   MatMultHermitianTranspose_SeqBAIJ,
2974   MatMultHermitianTransposeAdd_SeqBAIJ,
2975   NULL,
2976   /*124*/ NULL,
2977   MatGetColumnReductions_SeqBAIJ,
2978   MatInvertBlockDiagonal_SeqBAIJ,
2979   NULL,
2980   NULL,
2981   /*129*/ NULL,
2982   NULL,
2983   NULL,
2984   NULL,
2985   NULL,
2986   /*134*/ NULL,
2987   NULL,
2988   NULL,
2989   NULL,
2990   NULL,
2991   /*139*/ MatSetBlockSizes_Default,
2992   NULL,
2993   NULL,
2994   MatFDColoringSetUp_SeqXAIJ,
2995   NULL,
2996   /*144*/ MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
2997   MatDestroySubMatrices_SeqBAIJ,
2998   NULL,
2999   NULL,
3000   NULL,
3001   NULL,
3002   /*150*/ NULL,
3003 };
3004 
3005 PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat) {
3006   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3007   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3008 
3009   PetscFunctionBegin;
3010   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3011 
3012   /* allocate space for values if not already there */
3013   if (!aij->saved_values) { PetscCall(PetscMalloc1(nz + 1, &aij->saved_values)); }
3014 
3015   /* copy values over */
3016   PetscCall(PetscArraycpy(aij->saved_values, aij->a, nz));
3017   PetscFunctionReturn(0);
3018 }
3019 
3020 PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat) {
3021   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3022   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3023 
3024   PetscFunctionBegin;
3025   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3026   PetscCheck(aij->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
3027 
3028   /* copy values over */
3029   PetscCall(PetscArraycpy(aij->a, aij->saved_values, nz));
3030   PetscFunctionReturn(0);
3031 }
3032 
3033 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3034 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);
3035 
3036 PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, PetscInt *nnz) {
3037   Mat_SeqBAIJ *b;
3038   PetscInt     i, mbs, nbs, bs2;
3039   PetscBool    flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
3040 
3041   PetscFunctionBegin;
3042   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3043   if (nz == MAT_SKIP_ALLOCATION) {
3044     skipallocation = PETSC_TRUE;
3045     nz             = 0;
3046   }
3047 
3048   PetscCall(MatSetBlockSize(B, PetscAbs(bs)));
3049   PetscCall(PetscLayoutSetUp(B->rmap));
3050   PetscCall(PetscLayoutSetUp(B->cmap));
3051   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3052 
3053   B->preallocated = PETSC_TRUE;
3054 
3055   mbs = B->rmap->n / bs;
3056   nbs = B->cmap->n / bs;
3057   bs2 = bs * bs;
3058 
3059   PetscCheck(mbs * bs == B->rmap->n && nbs * bs == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows %" PetscInt_FMT ", cols %" PetscInt_FMT " must be divisible by blocksize %" PetscInt_FMT, B->rmap->N, B->cmap->n, bs);
3060 
3061   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3062   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nz cannot be less than 0: value %" PetscInt_FMT, nz);
3063   if (nnz) {
3064     for (i = 0; i < mbs; i++) {
3065       PetscCheck(nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, nnz[i]);
3066       PetscCheck(nnz[i] <= nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be greater than block row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, nnz[i], nbs);
3067     }
3068   }
3069 
3070   b = (Mat_SeqBAIJ *)B->data;
3071   PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3072   PetscCall(PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL));
3073   PetscOptionsEnd();
3074 
3075   if (!flg) {
3076     switch (bs) {
3077     case 1:
3078       B->ops->mult    = MatMult_SeqBAIJ_1;
3079       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3080       break;
3081     case 2:
3082       B->ops->mult    = MatMult_SeqBAIJ_2;
3083       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3084       break;
3085     case 3:
3086       B->ops->mult    = MatMult_SeqBAIJ_3;
3087       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3088       break;
3089     case 4:
3090       B->ops->mult    = MatMult_SeqBAIJ_4;
3091       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3092       break;
3093     case 5:
3094       B->ops->mult    = MatMult_SeqBAIJ_5;
3095       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3096       break;
3097     case 6:
3098       B->ops->mult    = MatMult_SeqBAIJ_6;
3099       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3100       break;
3101     case 7:
3102       B->ops->mult    = MatMult_SeqBAIJ_7;
3103       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3104       break;
3105     case 9: {
3106       PetscInt version = 1;
3107       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3108       switch (version) {
3109 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3110       case 1:
3111         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
3112         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3113         PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3114         break;
3115 #endif
3116       default:
3117         B->ops->mult    = MatMult_SeqBAIJ_N;
3118         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3119         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3120         break;
3121       }
3122       break;
3123     }
3124     case 11:
3125       B->ops->mult    = MatMult_SeqBAIJ_11;
3126       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3127       break;
3128     case 12: {
3129       PetscInt version = 1;
3130       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3131       switch (version) {
3132       case 1:
3133         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
3134         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3135         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3136         break;
3137       case 2:
3138         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
3139         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3140         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3141         break;
3142 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3143       case 3:
3144         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
3145         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3146         PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3147         break;
3148 #endif
3149       default:
3150         B->ops->mult    = MatMult_SeqBAIJ_N;
3151         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3152         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3153         break;
3154       }
3155       break;
3156     }
3157     case 15: {
3158       PetscInt version = 1;
3159       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3160       switch (version) {
3161       case 1:
3162         B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3163         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3164         break;
3165       case 2:
3166         B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3167         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3168         break;
3169       case 3:
3170         B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3171         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3172         break;
3173       case 4:
3174         B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3175         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3176         break;
3177       default:
3178         B->ops->mult = MatMult_SeqBAIJ_N;
3179         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3180         break;
3181       }
3182       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3183       break;
3184     }
3185     default:
3186       B->ops->mult    = MatMult_SeqBAIJ_N;
3187       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3188       PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3189       break;
3190     }
3191   }
3192   B->ops->sor = MatSOR_SeqBAIJ;
3193   b->mbs      = mbs;
3194   b->nbs      = nbs;
3195   if (!skipallocation) {
3196     if (!b->imax) {
3197       PetscCall(PetscMalloc2(mbs, &b->imax, mbs, &b->ilen));
3198 
3199       b->free_imax_ilen = PETSC_TRUE;
3200     }
3201     /* b->ilen will count nonzeros in each block row so far. */
3202     for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3203     if (!nnz) {
3204       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3205       else if (nz < 0) nz = 1;
3206       nz = PetscMin(nz, nbs);
3207       for (i = 0; i < mbs; i++) b->imax[i] = nz;
3208       PetscCall(PetscIntMultError(nz, mbs, &nz));
3209     } else {
3210       PetscInt64 nz64 = 0;
3211       for (i = 0; i < mbs; i++) {
3212         b->imax[i] = nnz[i];
3213         nz64 += nnz[i];
3214       }
3215       PetscCall(PetscIntCast(nz64, &nz));
3216     }
3217 
3218     /* allocate the matrix space */
3219     PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
3220     if (B->structure_only) {
3221       PetscCall(PetscMalloc1(nz, &b->j));
3222       PetscCall(PetscMalloc1(B->rmap->N + 1, &b->i));
3223     } else {
3224       PetscInt nzbs2 = 0;
3225       PetscCall(PetscIntMultError(nz, bs2, &nzbs2));
3226       PetscCall(PetscMalloc3(nzbs2, &b->a, nz, &b->j, B->rmap->N + 1, &b->i));
3227       PetscCall(PetscArrayzero(b->a, nz * bs2));
3228     }
3229     PetscCall(PetscArrayzero(b->j, nz));
3230 
3231     if (B->structure_only) {
3232       b->singlemalloc = PETSC_FALSE;
3233       b->free_a       = PETSC_FALSE;
3234     } else {
3235       b->singlemalloc = PETSC_TRUE;
3236       b->free_a       = PETSC_TRUE;
3237     }
3238     b->free_ij = PETSC_TRUE;
3239 
3240     b->i[0] = 0;
3241     for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];
3242 
3243   } else {
3244     b->free_a  = PETSC_FALSE;
3245     b->free_ij = PETSC_FALSE;
3246   }
3247 
3248   b->bs2              = bs2;
3249   b->mbs              = mbs;
3250   b->nz               = 0;
3251   b->maxnz            = nz;
3252   B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3253   B->was_assembled    = PETSC_FALSE;
3254   B->assembled        = PETSC_FALSE;
3255   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
3256   PetscFunctionReturn(0);
3257 }
3258 
3259 PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[]) {
3260   PetscInt     i, m, nz, nz_max = 0, *nnz;
3261   PetscScalar *values      = NULL;
3262   PetscBool    roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;
3263 
3264   PetscFunctionBegin;
3265   PetscCheck(bs >= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Invalid block size specified, must be positive but it is %" PetscInt_FMT, bs);
3266   PetscCall(PetscLayoutSetBlockSize(B->rmap, bs));
3267   PetscCall(PetscLayoutSetBlockSize(B->cmap, bs));
3268   PetscCall(PetscLayoutSetUp(B->rmap));
3269   PetscCall(PetscLayoutSetUp(B->cmap));
3270   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3271   m = B->rmap->n / bs;
3272 
3273   PetscCheck(ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %" PetscInt_FMT, ii[0]);
3274   PetscCall(PetscMalloc1(m + 1, &nnz));
3275   for (i = 0; i < m; i++) {
3276     nz = ii[i + 1] - ii[i];
3277     PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative number of columns %" PetscInt_FMT, i, nz);
3278     nz_max = PetscMax(nz_max, nz);
3279     nnz[i] = nz;
3280   }
3281   PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
3282   PetscCall(PetscFree(nnz));
3283 
3284   values = (PetscScalar *)V;
3285   if (!values) PetscCall(PetscCalloc1(bs * bs * (nz_max + 1), &values));
3286   for (i = 0; i < m; i++) {
3287     PetscInt        ncols = ii[i + 1] - ii[i];
3288     const PetscInt *icols = jj + ii[i];
3289     if (bs == 1 || !roworiented) {
3290       const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3291       PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES));
3292     } else {
3293       PetscInt j;
3294       for (j = 0; j < ncols; j++) {
3295         const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3296         PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES));
3297       }
3298     }
3299   }
3300   if (!V) PetscCall(PetscFree(values));
3301   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3302   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3303   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3304   PetscFunctionReturn(0);
3305 }
3306 
3307 /*@C
3308    MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored
3309 
3310    Not Collective
3311 
3312    Input Parameter:
3313 .  mat - a `MATSEQBAIJ` matrix
3314 
3315    Output Parameter:
3316 .   array - pointer to the data
3317 
3318    Level: intermediate
3319 
3320 .seealso: `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3321 @*/
3322 PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar **array) {
3323   PetscFunctionBegin;
3324   PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3325   PetscFunctionReturn(0);
3326 }
3327 
3328 /*@C
3329    MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`
3330 
3331    Not Collective
3332 
3333    Input Parameters:
3334 +  mat - a `MATSEQBAIJ` matrix
3335 -  array - pointer to the data
3336 
3337    Level: intermediate
3338 
3339 .seealso: `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3340 @*/
3341 PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar **array) {
3342   PetscFunctionBegin;
3343   PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3344   PetscFunctionReturn(0);
3345 }
3346 
3347 /*MC
3348    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3349    block sparse compressed row format.
3350 
3351    Options Database Keys:
3352 + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions()
3353 - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)
3354 
3355    Level: beginner
3356 
3357    Notes:
3358     `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3359     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
3360 
3361    Run with -info to see what version of the matrix-vector product is being used
3362 
3363 .seealso: `MatCreateSeqBAIJ()`
3364 M*/
3365 
3366 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);
3367 
3368 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B) {
3369   PetscMPIInt  size;
3370   Mat_SeqBAIJ *b;
3371 
3372   PetscFunctionBegin;
3373   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
3374   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Comm must be of size 1");
3375 
3376   PetscCall(PetscNew(&b));
3377   B->data = (void *)b;
3378   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
3379 
3380   b->row          = NULL;
3381   b->col          = NULL;
3382   b->icol         = NULL;
3383   b->reallocs     = 0;
3384   b->saved_values = NULL;
3385 
3386   b->roworiented        = PETSC_TRUE;
3387   b->nonew              = 0;
3388   b->diag               = NULL;
3389   B->spptr              = NULL;
3390   B->info.nz_unneeded   = (PetscReal)b->maxnz * b->bs2;
3391   b->keepnonzeropattern = PETSC_FALSE;
3392 
3393   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ));
3394   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ));
3395   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ));
3396   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ));
3397   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ));
3398   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ));
3399   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ));
3400   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ));
3401   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ));
3402   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ));
3403 #if defined(PETSC_HAVE_HYPRE)
3404   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE));
3405 #endif
3406   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS));
3407   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));
3408   PetscFunctionReturn(0);
3409 }
3410 
3411 PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
3412   Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3413   PetscInt     i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;
3414 
3415   PetscFunctionBegin;
3416   PetscCheck(a->i[mbs] == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupt matrix");
3417 
3418   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3419     c->imax           = a->imax;
3420     c->ilen           = a->ilen;
3421     c->free_imax_ilen = PETSC_FALSE;
3422   } else {
3423     PetscCall(PetscMalloc2(mbs, &c->imax, mbs, &c->ilen));
3424     for (i = 0; i < mbs; i++) {
3425       c->imax[i] = a->imax[i];
3426       c->ilen[i] = a->ilen[i];
3427     }
3428     c->free_imax_ilen = PETSC_TRUE;
3429   }
3430 
3431   /* allocate the matrix space */
3432   if (mallocmatspace) {
3433     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3434       PetscCall(PetscCalloc1(bs2 * nz, &c->a));
3435 
3436       c->i            = a->i;
3437       c->j            = a->j;
3438       c->singlemalloc = PETSC_FALSE;
3439       c->free_a       = PETSC_TRUE;
3440       c->free_ij      = PETSC_FALSE;
3441       c->parent       = A;
3442       C->preallocated = PETSC_TRUE;
3443       C->assembled    = PETSC_TRUE;
3444 
3445       PetscCall(PetscObjectReference((PetscObject)A));
3446       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3447       PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3448     } else {
3449       PetscCall(PetscMalloc3(bs2 * nz, &c->a, nz, &c->j, mbs + 1, &c->i));
3450 
3451       c->singlemalloc = PETSC_TRUE;
3452       c->free_a       = PETSC_TRUE;
3453       c->free_ij      = PETSC_TRUE;
3454 
3455       PetscCall(PetscArraycpy(c->i, a->i, mbs + 1));
3456       if (mbs > 0) {
3457         PetscCall(PetscArraycpy(c->j, a->j, nz));
3458         if (cpvalues == MAT_COPY_VALUES) {
3459           PetscCall(PetscArraycpy(c->a, a->a, bs2 * nz));
3460         } else {
3461           PetscCall(PetscArrayzero(c->a, bs2 * nz));
3462         }
3463       }
3464       C->preallocated = PETSC_TRUE;
3465       C->assembled    = PETSC_TRUE;
3466     }
3467   }
3468 
3469   c->roworiented = a->roworiented;
3470   c->nonew       = a->nonew;
3471 
3472   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
3473   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
3474 
3475   c->bs2 = a->bs2;
3476   c->mbs = a->mbs;
3477   c->nbs = a->nbs;
3478 
3479   if (a->diag) {
3480     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3481       c->diag      = a->diag;
3482       c->free_diag = PETSC_FALSE;
3483     } else {
3484       PetscCall(PetscMalloc1(mbs + 1, &c->diag));
3485       for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i];
3486       c->free_diag = PETSC_TRUE;
3487     }
3488   } else c->diag = NULL;
3489 
3490   c->nz         = a->nz;
3491   c->maxnz      = a->nz; /* Since we allocate exactly the right amount */
3492   c->solve_work = NULL;
3493   c->mult_work  = NULL;
3494   c->sor_workt  = NULL;
3495   c->sor_work   = NULL;
3496 
3497   c->compressedrow.use   = a->compressedrow.use;
3498   c->compressedrow.nrows = a->compressedrow.nrows;
3499   if (a->compressedrow.use) {
3500     i = a->compressedrow.nrows;
3501     PetscCall(PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex));
3502     PetscCall(PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1));
3503     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i));
3504   } else {
3505     c->compressedrow.use    = PETSC_FALSE;
3506     c->compressedrow.i      = NULL;
3507     c->compressedrow.rindex = NULL;
3508   }
3509   C->nonzerostate = A->nonzerostate;
3510 
3511   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
3512   PetscFunctionReturn(0);
3513 }
3514 
3515 PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B) {
3516   PetscFunctionBegin;
3517   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
3518   PetscCall(MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n));
3519   PetscCall(MatSetType(*B, MATSEQBAIJ));
3520   PetscCall(MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE));
3521   PetscFunctionReturn(0);
3522 }
3523 
3524 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3525 PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer) {
3526   PetscInt     header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3527   PetscInt    *rowidxs, *colidxs;
3528   PetscScalar *matvals;
3529 
3530   PetscFunctionBegin;
3531   PetscCall(PetscViewerSetUp(viewer));
3532 
3533   /* read matrix header */
3534   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3535   PetscCheck(header[0] == MAT_FILE_CLASSID, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3536   M  = header[1];
3537   N  = header[2];
3538   nz = header[3];
3539   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3540   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3541   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as SeqBAIJ");
3542 
3543   /* set block sizes from the viewer's .info file */
3544   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3545   /* set local and global sizes if not set already */
3546   if (mat->rmap->n < 0) mat->rmap->n = M;
3547   if (mat->cmap->n < 0) mat->cmap->n = N;
3548   if (mat->rmap->N < 0) mat->rmap->N = M;
3549   if (mat->cmap->N < 0) mat->cmap->N = N;
3550   PetscCall(PetscLayoutSetUp(mat->rmap));
3551   PetscCall(PetscLayoutSetUp(mat->cmap));
3552 
3553   /* check if the matrix sizes are correct */
3554   PetscCall(MatGetSize(mat, &rows, &cols));
3555   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3556   PetscCall(MatGetBlockSize(mat, &bs));
3557   PetscCall(MatGetLocalSize(mat, &m, &n));
3558   mbs = m / bs;
3559   nbs = n / bs;
3560 
3561   /* read in row lengths, column indices and nonzero values */
3562   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3563   PetscCall(PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT));
3564   rowidxs[0] = 0;
3565   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3566   sum = rowidxs[m];
3567   PetscCheck(sum == nz, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3568 
3569   /* read in column indices and nonzero values */
3570   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals));
3571   PetscCall(PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT));
3572   PetscCall(PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR));
3573 
3574   {               /* preallocate matrix storage */
3575     PetscBT   bt; /* helper bit set to count nonzeros */
3576     PetscInt *nnz;
3577     PetscBool sbaij;
3578 
3579     PetscCall(PetscBTCreate(nbs, &bt));
3580     PetscCall(PetscCalloc1(mbs, &nnz));
3581     PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij));
3582     for (i = 0; i < mbs; i++) {
3583       PetscCall(PetscBTMemzero(nbs, bt));
3584       for (k = 0; k < bs; k++) {
3585         PetscInt row = bs * i + k;
3586         for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3587           PetscInt col = colidxs[j];
3588           if (!sbaij || col >= row)
3589             if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3590         }
3591       }
3592     }
3593     PetscCall(PetscBTDestroy(&bt));
3594     PetscCall(MatSeqBAIJSetPreallocation(mat, bs, 0, nnz));
3595     PetscCall(MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz));
3596     PetscCall(PetscFree(nnz));
3597   }
3598 
3599   /* store matrix values */
3600   for (i = 0; i < m; i++) {
3601     PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3602     PetscCall((*mat->ops->setvalues)(mat, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES));
3603   }
3604 
3605   PetscCall(PetscFree(rowidxs));
3606   PetscCall(PetscFree2(colidxs, matvals));
3607   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
3608   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
3609   PetscFunctionReturn(0);
3610 }
3611 
3612 PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer) {
3613   PetscBool isbinary;
3614 
3615   PetscFunctionBegin;
3616   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3617   PetscCheck(isbinary, PetscObjectComm((PetscObject)viewer), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)mat)->type_name);
3618   PetscCall(MatLoad_SeqBAIJ_Binary(mat, viewer));
3619   PetscFunctionReturn(0);
3620 }
3621 
3622 /*@C
3623    MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3624    compressed row) format.  For good matrix assembly performance the
3625    user should preallocate the matrix storage by setting the parameter nz
3626    (or the array nnz).  By setting these parameters accurately, performance
3627    during matrix assembly can be increased by more than a factor of 50.
3628 
3629    Collective
3630 
3631    Input Parameters:
3632 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3633 .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3634           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3635 .  m - number of rows
3636 .  n - number of columns
3637 .  nz - number of nonzero blocks  per block row (same for all rows)
3638 -  nnz - array containing the number of nonzero blocks in the various block rows
3639          (possibly different for each block row) or NULL
3640 
3641    Output Parameter:
3642 .  A - the matrix
3643 
3644    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3645    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3646    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3647 
3648    Options Database Keys:
3649 +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3650 -   -mat_block_size - size of the blocks to use
3651 
3652    Level: intermediate
3653 
3654    Notes:
3655    The number of rows and columns must be divisible by blocksize.
3656 
3657    If the nnz parameter is given then the nz parameter is ignored
3658 
3659    A nonzero block is any block that as 1 or more nonzeros in it
3660 
3661    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3662    storage.  That is, the stored row and column indices can begin at
3663    either one (as in Fortran) or zero.  See the users' manual for details.
3664 
3665    Specify the preallocated storage with either nz or nnz (not both).
3666    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3667    allocation.  See [Sparse Matrices](sec_matsparse) for details.
3668    matrices.
3669 
3670 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3671 @*/
3672 PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
3673   PetscFunctionBegin;
3674   PetscCall(MatCreate(comm, A));
3675   PetscCall(MatSetSizes(*A, m, n, m, n));
3676   PetscCall(MatSetType(*A, MATSEQBAIJ));
3677   PetscCall(MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz));
3678   PetscFunctionReturn(0);
3679 }
3680 
3681 /*@C
3682    MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3683    per row in the matrix. For good matrix assembly performance the
3684    user should preallocate the matrix storage by setting the parameter nz
3685    (or the array nnz).  By setting these parameters accurately, performance
3686    during matrix assembly can be increased by more than a factor of 50.
3687 
3688    Collective
3689 
3690    Input Parameters:
3691 +  B - the matrix
3692 .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3693           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3694 .  nz - number of block nonzeros per block row (same for all rows)
3695 -  nnz - array containing the number of block nonzeros in the various block rows
3696          (possibly different for each block row) or NULL
3697 
3698    Options Database Keys:
3699 +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3700 -   -mat_block_size - size of the blocks to use
3701 
3702    Level: intermediate
3703 
3704    Notes:
3705    If the nnz parameter is given then the nz parameter is ignored
3706 
3707    You can call `MatGetInfo()` to get information on how effective the preallocation was;
3708    for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3709    You can also run with the option -info and look for messages with the string
3710    malloc in them to see if additional memory allocation was needed.
3711 
3712    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3713    storage.  That is, the stored row and column indices can begin at
3714    either one (as in Fortran) or zero.  See the users' manual for details.
3715 
3716    Specify the preallocated storage with either nz or nnz (not both).
3717    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3718    allocation.  See [Sparse Matrices](sec_matsparse) for details.
3719 
3720 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3721 @*/
3722 PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[]) {
3723   PetscFunctionBegin;
3724   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3725   PetscValidType(B, 1);
3726   PetscValidLogicalCollectiveInt(B, bs, 2);
3727   PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3728   PetscFunctionReturn(0);
3729 }
3730 
3731 /*@C
3732    MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values
3733 
3734    Collective
3735 
3736    Input Parameters:
3737 +  B - the matrix
3738 .  i - the indices into j for the start of each local row (starts with zero)
3739 .  j - the column indices for each local row (starts with zero) these must be sorted for each row
3740 -  v - optional values in the matrix
3741 
3742    Level: advanced
3743 
3744    Notes:
3745    The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`.  For example, C programs
3746    may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3747    over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3748    `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3749    block column and the second index is over columns within a block.
3750 
3751    Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well
3752 
3753 .seealso: `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3754 @*/
3755 PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[]) {
3756   PetscFunctionBegin;
3757   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3758   PetscValidType(B, 1);
3759   PetscValidLogicalCollectiveInt(B, bs, 2);
3760   PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3761   PetscFunctionReturn(0);
3762 }
3763 
3764 /*@
3765      MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.
3766 
3767      Collective
3768 
3769    Input Parameters:
3770 +  comm - must be an MPI communicator of size 1
3771 .  bs - size of block
3772 .  m - number of rows
3773 .  n - number of columns
3774 .  i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3775 .  j - column indices
3776 -  a - matrix values
3777 
3778    Output Parameter:
3779 .  mat - the matrix
3780 
3781    Level: advanced
3782 
3783    Notes:
3784        The i, j, and a arrays are not copied by this routine, the user must free these arrays
3785     once the matrix is destroyed
3786 
3787        You cannot set new nonzero locations into this matrix, that will generate an error.
3788 
3789        The i and j indices are 0 based
3790 
3791        When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format
3792 
3793       The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3794       the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3795       block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3796       with column-major ordering within blocks.
3797 
3798 .seealso: `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3799 @*/
3800 PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat) {
3801   PetscInt     ii;
3802   Mat_SeqBAIJ *baij;
3803 
3804   PetscFunctionBegin;
3805   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "block size %" PetscInt_FMT " > 1 is not supported yet", bs);
3806   if (m > 0) PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
3807 
3808   PetscCall(MatCreate(comm, mat));
3809   PetscCall(MatSetSizes(*mat, m, n, m, n));
3810   PetscCall(MatSetType(*mat, MATSEQBAIJ));
3811   PetscCall(MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL));
3812   baij = (Mat_SeqBAIJ *)(*mat)->data;
3813   PetscCall(PetscMalloc2(m, &baij->imax, m, &baij->ilen));
3814 
3815   baij->i = i;
3816   baij->j = j;
3817   baij->a = a;
3818 
3819   baij->singlemalloc = PETSC_FALSE;
3820   baij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3821   baij->free_a       = PETSC_FALSE;
3822   baij->free_ij      = PETSC_FALSE;
3823 
3824   for (ii = 0; ii < m; ii++) {
3825     baij->ilen[ii] = baij->imax[ii] = i[ii + 1] - i[ii];
3826     PetscCheck(i[ii + 1] - i[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative row length in i (row indices) row = %" PetscInt_FMT " length = %" PetscInt_FMT, ii, i[ii + 1] - i[ii]);
3827   }
3828   if (PetscDefined(USE_DEBUG)) {
3829     for (ii = 0; ii < baij->i[m]; ii++) {
3830       PetscCheck(j[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative column index at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3831       PetscCheck(j[ii] <= n - 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index to large at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3832     }
3833   }
3834 
3835   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
3836   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
3837   PetscFunctionReturn(0);
3838 }
3839 
3840 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat) {
3841   PetscFunctionBegin;
3842   PetscCall(MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat));
3843   PetscFunctionReturn(0);
3844 }
3845