xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 27f169480a6668db3ba90f8ef6ef68d542d113fa) !
1 /*
2     Defines the basic matrix operations for the BAIJ (compressed row)
3   matrix storage format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blockinvert.h>
8 #include <petsc/private/kernels/blockmatmult.h>
9 
10 /* defines MatSetValues_Seq_Hash(), MatAssemblyEnd_Seq_Hash(), MatSetUp_Seq_Hash() */
11 #define TYPE BAIJ
12 #define TYPE_BS
13 #include "../src/mat/impls/aij/seq/seqhashmatsetvalues.h"
14 #undef TYPE_BS
15 #define TYPE_BS _BS
16 #define TYPE_BS_ON
17 #include "../src/mat/impls/aij/seq/seqhashmatsetvalues.h"
18 #undef TYPE_BS
19 #include "../src/mat/impls/aij/seq/seqhashmat.h"
20 #undef TYPE
21 #undef TYPE_BS_ON
22 
23 #if defined(PETSC_HAVE_HYPRE)
24 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
25 #endif
26 
27 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
28 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
29 #endif
30 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
31 
32 MatGetDiagonalMarkers(SeqBAIJ, A->rmap->bs)
33 
MatGetColumnReductions_SeqBAIJ(Mat A,PetscInt type,PetscReal * reductions)34 static PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions)
35 {
36   Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
37   PetscInt     m, n, ib, jb, bs = A->rmap->bs;
38   MatScalar   *a_val = a_aij->a;
39 
40   PetscFunctionBegin;
41   PetscCall(MatGetSize(A, &m, &n));
42   PetscCall(PetscArrayzero(reductions, n));
43   if (type == NORM_2) {
44     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
45       for (jb = 0; jb < bs; jb++) {
46         for (ib = 0; ib < bs; ib++) {
47           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
48           a_val++;
49         }
50       }
51     }
52   } else if (type == NORM_1) {
53     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
54       for (jb = 0; jb < bs; jb++) {
55         for (ib = 0; ib < bs; ib++) {
56           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
57           a_val++;
58         }
59       }
60     }
61   } else if (type == NORM_INFINITY) {
62     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
63       for (jb = 0; jb < bs; jb++) {
64         for (ib = 0; ib < bs; ib++) {
65           PetscInt col    = A->cmap->rstart + a_aij->j[i] * bs + jb;
66           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
67           a_val++;
68         }
69       }
70     }
71   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
72     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
73       for (jb = 0; jb < bs; jb++) {
74         for (ib = 0; ib < bs; ib++) {
75           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
76           a_val++;
77         }
78       }
79     }
80   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
81     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
82       for (jb = 0; jb < bs; jb++) {
83         for (ib = 0; ib < bs; ib++) {
84           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
85           a_val++;
86         }
87       }
88     }
89   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
90   if (type == NORM_2) {
91     for (PetscInt i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
92   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
93     for (PetscInt i = 0; i < n; i++) reductions[i] /= m;
94   }
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
MatInvertBlockDiagonal_SeqBAIJ(Mat A,const PetscScalar ** values)98 static PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values)
99 {
100   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
101   PetscInt        i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
102   MatScalar      *v     = a->a, *odiag, *diag, work[25], *v_work;
103   PetscReal       shift = 0.0;
104   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
105   const PetscInt *adiag;
106 
107   PetscFunctionBegin;
108   allowzeropivot = PetscNot(A->erroriffailure);
109 
110   if (a->idiagvalid) {
111     if (values) *values = a->idiag;
112     PetscFunctionReturn(PETSC_SUCCESS);
113   }
114   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, NULL));
115   if (!a->idiag) PetscCall(PetscMalloc1(bs2 * mbs, &a->idiag));
116   diag = a->idiag;
117   if (values) *values = a->idiag;
118   /* factor and invert each block */
119   switch (bs) {
120   case 1:
121     for (i = 0; i < mbs; i++) {
122       odiag   = v + 1 * adiag[i];
123       diag[0] = odiag[0];
124 
125       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
126         PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
127         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
128         A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
129         A->factorerror_zeropivot_row   = i;
130         PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i));
131       }
132 
133       diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
134       diag += 1;
135     }
136     break;
137   case 2:
138     for (i = 0; i < mbs; i++) {
139       odiag   = v + 4 * adiag[i];
140       diag[0] = odiag[0];
141       diag[1] = odiag[1];
142       diag[2] = odiag[2];
143       diag[3] = odiag[3];
144       PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
145       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
146       diag += 4;
147     }
148     break;
149   case 3:
150     for (i = 0; i < mbs; i++) {
151       odiag   = v + 9 * adiag[i];
152       diag[0] = odiag[0];
153       diag[1] = odiag[1];
154       diag[2] = odiag[2];
155       diag[3] = odiag[3];
156       diag[4] = odiag[4];
157       diag[5] = odiag[5];
158       diag[6] = odiag[6];
159       diag[7] = odiag[7];
160       diag[8] = odiag[8];
161       PetscCall(PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected));
162       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
163       diag += 9;
164     }
165     break;
166   case 4:
167     for (i = 0; i < mbs; i++) {
168       odiag = v + 16 * adiag[i];
169       PetscCall(PetscArraycpy(diag, odiag, 16));
170       PetscCall(PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected));
171       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
172       diag += 16;
173     }
174     break;
175   case 5:
176     for (i = 0; i < mbs; i++) {
177       odiag = v + 25 * adiag[i];
178       PetscCall(PetscArraycpy(diag, odiag, 25));
179       PetscCall(PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
180       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
181       diag += 25;
182     }
183     break;
184   case 6:
185     for (i = 0; i < mbs; i++) {
186       odiag = v + 36 * adiag[i];
187       PetscCall(PetscArraycpy(diag, odiag, 36));
188       PetscCall(PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected));
189       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
190       diag += 36;
191     }
192     break;
193   case 7:
194     for (i = 0; i < mbs; i++) {
195       odiag = v + 49 * adiag[i];
196       PetscCall(PetscArraycpy(diag, odiag, 49));
197       PetscCall(PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected));
198       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199       diag += 49;
200     }
201     break;
202   default:
203     PetscCall(PetscMalloc2(bs, &v_work, bs, &v_pivots));
204     for (i = 0; i < mbs; i++) {
205       odiag = v + bs2 * adiag[i];
206       PetscCall(PetscArraycpy(diag, odiag, bs2));
207       PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
208       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
209       diag += bs2;
210     }
211     PetscCall(PetscFree2(v_work, v_pivots));
212   }
213   a->idiagvalid = PETSC_TRUE;
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
MatSOR_SeqBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)217 static PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
218 {
219   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
220   PetscScalar       *x, *work, *w, *workt, *t;
221   const MatScalar   *v, *aa = a->a, *idiag;
222   const PetscScalar *b, *xb;
223   PetscScalar        s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
224   PetscInt           m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
225   const PetscInt    *diag, *ai = a->i, *aj = a->j, *vi;
226 
227   PetscFunctionBegin;
228   its = its * lits;
229   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
230   PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
231   PetscCheck(!fshift, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for diagonal shift");
232   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for non-trivial relaxation factor");
233   PetscCheck(!(flag & SOR_APPLY_UPPER) && !(flag & SOR_APPLY_LOWER), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for applying upper or lower triangular parts");
234 
235   if (!a->idiagvalid) PetscCall(MatInvertBlockDiagonal(A, NULL));
236 
237   if (!m) PetscFunctionReturn(PETSC_SUCCESS);
238   diag  = a->diag;
239   idiag = a->idiag;
240   k     = PetscMax(A->rmap->n, A->cmap->n);
241   if (!a->mult_work) PetscCall(PetscMalloc1(k + 1, &a->mult_work));
242   if (!a->sor_workt) PetscCall(PetscMalloc1(k, &a->sor_workt));
243   if (!a->sor_work) PetscCall(PetscMalloc1(bs, &a->sor_work));
244   work = a->mult_work;
245   t    = a->sor_workt;
246   w    = a->sor_work;
247 
248   PetscCall(VecGetArray(xx, &x));
249   PetscCall(VecGetArrayRead(bb, &b));
250 
251   if (flag & SOR_ZERO_INITIAL_GUESS) {
252     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
253       switch (bs) {
254       case 1:
255         PetscKernel_v_gets_A_times_w_1(x, idiag, b);
256         t[0] = b[0];
257         i2   = 1;
258         idiag += 1;
259         for (i = 1; i < m; i++) {
260           v    = aa + ai[i];
261           vi   = aj + ai[i];
262           nz   = diag[i] - ai[i];
263           s[0] = b[i2];
264           for (j = 0; j < nz; j++) {
265             xw[0] = x[vi[j]];
266             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
267           }
268           t[i2] = s[0];
269           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
270           x[i2] = xw[0];
271           idiag += 1;
272           i2 += 1;
273         }
274         break;
275       case 2:
276         PetscKernel_v_gets_A_times_w_2(x, idiag, b);
277         t[0] = b[0];
278         t[1] = b[1];
279         i2   = 2;
280         idiag += 4;
281         for (i = 1; i < m; i++) {
282           v    = aa + 4 * ai[i];
283           vi   = aj + ai[i];
284           nz   = diag[i] - ai[i];
285           s[0] = b[i2];
286           s[1] = b[i2 + 1];
287           for (j = 0; j < nz; j++) {
288             idx   = 2 * vi[j];
289             it    = 4 * j;
290             xw[0] = x[idx];
291             xw[1] = x[1 + idx];
292             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
293           }
294           t[i2]     = s[0];
295           t[i2 + 1] = s[1];
296           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
297           x[i2]     = xw[0];
298           x[i2 + 1] = xw[1];
299           idiag += 4;
300           i2 += 2;
301         }
302         break;
303       case 3:
304         PetscKernel_v_gets_A_times_w_3(x, idiag, b);
305         t[0] = b[0];
306         t[1] = b[1];
307         t[2] = b[2];
308         i2   = 3;
309         idiag += 9;
310         for (i = 1; i < m; i++) {
311           v    = aa + 9 * ai[i];
312           vi   = aj + ai[i];
313           nz   = diag[i] - ai[i];
314           s[0] = b[i2];
315           s[1] = b[i2 + 1];
316           s[2] = b[i2 + 2];
317           while (nz--) {
318             idx   = 3 * (*vi++);
319             xw[0] = x[idx];
320             xw[1] = x[1 + idx];
321             xw[2] = x[2 + idx];
322             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
323             v += 9;
324           }
325           t[i2]     = s[0];
326           t[i2 + 1] = s[1];
327           t[i2 + 2] = s[2];
328           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
329           x[i2]     = xw[0];
330           x[i2 + 1] = xw[1];
331           x[i2 + 2] = xw[2];
332           idiag += 9;
333           i2 += 3;
334         }
335         break;
336       case 4:
337         PetscKernel_v_gets_A_times_w_4(x, idiag, b);
338         t[0] = b[0];
339         t[1] = b[1];
340         t[2] = b[2];
341         t[3] = b[3];
342         i2   = 4;
343         idiag += 16;
344         for (i = 1; i < m; i++) {
345           v    = aa + 16 * ai[i];
346           vi   = aj + ai[i];
347           nz   = diag[i] - ai[i];
348           s[0] = b[i2];
349           s[1] = b[i2 + 1];
350           s[2] = b[i2 + 2];
351           s[3] = b[i2 + 3];
352           while (nz--) {
353             idx   = 4 * (*vi++);
354             xw[0] = x[idx];
355             xw[1] = x[1 + idx];
356             xw[2] = x[2 + idx];
357             xw[3] = x[3 + idx];
358             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
359             v += 16;
360           }
361           t[i2]     = s[0];
362           t[i2 + 1] = s[1];
363           t[i2 + 2] = s[2];
364           t[i2 + 3] = s[3];
365           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
366           x[i2]     = xw[0];
367           x[i2 + 1] = xw[1];
368           x[i2 + 2] = xw[2];
369           x[i2 + 3] = xw[3];
370           idiag += 16;
371           i2 += 4;
372         }
373         break;
374       case 5:
375         PetscKernel_v_gets_A_times_w_5(x, idiag, b);
376         t[0] = b[0];
377         t[1] = b[1];
378         t[2] = b[2];
379         t[3] = b[3];
380         t[4] = b[4];
381         i2   = 5;
382         idiag += 25;
383         for (i = 1; i < m; i++) {
384           v    = aa + 25 * ai[i];
385           vi   = aj + ai[i];
386           nz   = diag[i] - ai[i];
387           s[0] = b[i2];
388           s[1] = b[i2 + 1];
389           s[2] = b[i2 + 2];
390           s[3] = b[i2 + 3];
391           s[4] = b[i2 + 4];
392           while (nz--) {
393             idx   = 5 * (*vi++);
394             xw[0] = x[idx];
395             xw[1] = x[1 + idx];
396             xw[2] = x[2 + idx];
397             xw[3] = x[3 + idx];
398             xw[4] = x[4 + idx];
399             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
400             v += 25;
401           }
402           t[i2]     = s[0];
403           t[i2 + 1] = s[1];
404           t[i2 + 2] = s[2];
405           t[i2 + 3] = s[3];
406           t[i2 + 4] = s[4];
407           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
408           x[i2]     = xw[0];
409           x[i2 + 1] = xw[1];
410           x[i2 + 2] = xw[2];
411           x[i2 + 3] = xw[3];
412           x[i2 + 4] = xw[4];
413           idiag += 25;
414           i2 += 5;
415         }
416         break;
417       case 6:
418         PetscKernel_v_gets_A_times_w_6(x, idiag, b);
419         t[0] = b[0];
420         t[1] = b[1];
421         t[2] = b[2];
422         t[3] = b[3];
423         t[4] = b[4];
424         t[5] = b[5];
425         i2   = 6;
426         idiag += 36;
427         for (i = 1; i < m; i++) {
428           v    = aa + 36 * ai[i];
429           vi   = aj + ai[i];
430           nz   = diag[i] - ai[i];
431           s[0] = b[i2];
432           s[1] = b[i2 + 1];
433           s[2] = b[i2 + 2];
434           s[3] = b[i2 + 3];
435           s[4] = b[i2 + 4];
436           s[5] = b[i2 + 5];
437           while (nz--) {
438             idx   = 6 * (*vi++);
439             xw[0] = x[idx];
440             xw[1] = x[1 + idx];
441             xw[2] = x[2 + idx];
442             xw[3] = x[3 + idx];
443             xw[4] = x[4 + idx];
444             xw[5] = x[5 + idx];
445             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
446             v += 36;
447           }
448           t[i2]     = s[0];
449           t[i2 + 1] = s[1];
450           t[i2 + 2] = s[2];
451           t[i2 + 3] = s[3];
452           t[i2 + 4] = s[4];
453           t[i2 + 5] = s[5];
454           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
455           x[i2]     = xw[0];
456           x[i2 + 1] = xw[1];
457           x[i2 + 2] = xw[2];
458           x[i2 + 3] = xw[3];
459           x[i2 + 4] = xw[4];
460           x[i2 + 5] = xw[5];
461           idiag += 36;
462           i2 += 6;
463         }
464         break;
465       case 7:
466         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
467         t[0] = b[0];
468         t[1] = b[1];
469         t[2] = b[2];
470         t[3] = b[3];
471         t[4] = b[4];
472         t[5] = b[5];
473         t[6] = b[6];
474         i2   = 7;
475         idiag += 49;
476         for (i = 1; i < m; i++) {
477           v    = aa + 49 * ai[i];
478           vi   = aj + ai[i];
479           nz   = diag[i] - ai[i];
480           s[0] = b[i2];
481           s[1] = b[i2 + 1];
482           s[2] = b[i2 + 2];
483           s[3] = b[i2 + 3];
484           s[4] = b[i2 + 4];
485           s[5] = b[i2 + 5];
486           s[6] = b[i2 + 6];
487           while (nz--) {
488             idx   = 7 * (*vi++);
489             xw[0] = x[idx];
490             xw[1] = x[1 + idx];
491             xw[2] = x[2 + idx];
492             xw[3] = x[3 + idx];
493             xw[4] = x[4 + idx];
494             xw[5] = x[5 + idx];
495             xw[6] = x[6 + idx];
496             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
497             v += 49;
498           }
499           t[i2]     = s[0];
500           t[i2 + 1] = s[1];
501           t[i2 + 2] = s[2];
502           t[i2 + 3] = s[3];
503           t[i2 + 4] = s[4];
504           t[i2 + 5] = s[5];
505           t[i2 + 6] = s[6];
506           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
507           x[i2]     = xw[0];
508           x[i2 + 1] = xw[1];
509           x[i2 + 2] = xw[2];
510           x[i2 + 3] = xw[3];
511           x[i2 + 4] = xw[4];
512           x[i2 + 5] = xw[5];
513           x[i2 + 6] = xw[6];
514           idiag += 49;
515           i2 += 7;
516         }
517         break;
518       default:
519         PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
520         PetscCall(PetscArraycpy(t, b, bs));
521         i2 = bs;
522         idiag += bs2;
523         for (i = 1; i < m; i++) {
524           v  = aa + bs2 * ai[i];
525           vi = aj + ai[i];
526           nz = diag[i] - ai[i];
527 
528           PetscCall(PetscArraycpy(w, b + i2, bs));
529           /* copy all rows of x that are needed into contiguous space */
530           workt = work;
531           for (j = 0; j < nz; j++) {
532             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
533             workt += bs;
534           }
535           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
536           PetscCall(PetscArraycpy(t + i2, w, bs));
537           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
538 
539           idiag += bs2;
540           i2 += bs;
541         }
542         break;
543       }
544       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
545       PetscCall(PetscLogFlops(1.0 * bs2 * a->nz));
546       xb = t;
547     } else xb = b;
548     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
549       idiag = a->idiag + bs2 * (a->mbs - 1);
550       i2    = bs * (m - 1);
551       switch (bs) {
552       case 1:
553         s[0] = xb[i2];
554         PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
555         x[i2] = xw[0];
556         i2 -= 1;
557         for (i = m - 2; i >= 0; i--) {
558           v    = aa + (diag[i] + 1);
559           vi   = aj + diag[i] + 1;
560           nz   = ai[i + 1] - diag[i] - 1;
561           s[0] = xb[i2];
562           for (j = 0; j < nz; j++) {
563             xw[0] = x[vi[j]];
564             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
565           }
566           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
567           x[i2] = xw[0];
568           idiag -= 1;
569           i2 -= 1;
570         }
571         break;
572       case 2:
573         s[0] = xb[i2];
574         s[1] = xb[i2 + 1];
575         PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
576         x[i2]     = xw[0];
577         x[i2 + 1] = xw[1];
578         i2 -= 2;
579         idiag -= 4;
580         for (i = m - 2; i >= 0; i--) {
581           v    = aa + 4 * (diag[i] + 1);
582           vi   = aj + diag[i] + 1;
583           nz   = ai[i + 1] - diag[i] - 1;
584           s[0] = xb[i2];
585           s[1] = xb[i2 + 1];
586           for (j = 0; j < nz; j++) {
587             idx   = 2 * vi[j];
588             it    = 4 * j;
589             xw[0] = x[idx];
590             xw[1] = x[1 + idx];
591             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
592           }
593           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
594           x[i2]     = xw[0];
595           x[i2 + 1] = xw[1];
596           idiag -= 4;
597           i2 -= 2;
598         }
599         break;
600       case 3:
601         s[0] = xb[i2];
602         s[1] = xb[i2 + 1];
603         s[2] = xb[i2 + 2];
604         PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
605         x[i2]     = xw[0];
606         x[i2 + 1] = xw[1];
607         x[i2 + 2] = xw[2];
608         i2 -= 3;
609         idiag -= 9;
610         for (i = m - 2; i >= 0; i--) {
611           v    = aa + 9 * (diag[i] + 1);
612           vi   = aj + diag[i] + 1;
613           nz   = ai[i + 1] - diag[i] - 1;
614           s[0] = xb[i2];
615           s[1] = xb[i2 + 1];
616           s[2] = xb[i2 + 2];
617           while (nz--) {
618             idx   = 3 * (*vi++);
619             xw[0] = x[idx];
620             xw[1] = x[1 + idx];
621             xw[2] = x[2 + idx];
622             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
623             v += 9;
624           }
625           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
626           x[i2]     = xw[0];
627           x[i2 + 1] = xw[1];
628           x[i2 + 2] = xw[2];
629           idiag -= 9;
630           i2 -= 3;
631         }
632         break;
633       case 4:
634         s[0] = xb[i2];
635         s[1] = xb[i2 + 1];
636         s[2] = xb[i2 + 2];
637         s[3] = xb[i2 + 3];
638         PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
639         x[i2]     = xw[0];
640         x[i2 + 1] = xw[1];
641         x[i2 + 2] = xw[2];
642         x[i2 + 3] = xw[3];
643         i2 -= 4;
644         idiag -= 16;
645         for (i = m - 2; i >= 0; i--) {
646           v    = aa + 16 * (diag[i] + 1);
647           vi   = aj + diag[i] + 1;
648           nz   = ai[i + 1] - diag[i] - 1;
649           s[0] = xb[i2];
650           s[1] = xb[i2 + 1];
651           s[2] = xb[i2 + 2];
652           s[3] = xb[i2 + 3];
653           while (nz--) {
654             idx   = 4 * (*vi++);
655             xw[0] = x[idx];
656             xw[1] = x[1 + idx];
657             xw[2] = x[2 + idx];
658             xw[3] = x[3 + idx];
659             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
660             v += 16;
661           }
662           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
663           x[i2]     = xw[0];
664           x[i2 + 1] = xw[1];
665           x[i2 + 2] = xw[2];
666           x[i2 + 3] = xw[3];
667           idiag -= 16;
668           i2 -= 4;
669         }
670         break;
671       case 5:
672         s[0] = xb[i2];
673         s[1] = xb[i2 + 1];
674         s[2] = xb[i2 + 2];
675         s[3] = xb[i2 + 3];
676         s[4] = xb[i2 + 4];
677         PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
678         x[i2]     = xw[0];
679         x[i2 + 1] = xw[1];
680         x[i2 + 2] = xw[2];
681         x[i2 + 3] = xw[3];
682         x[i2 + 4] = xw[4];
683         i2 -= 5;
684         idiag -= 25;
685         for (i = m - 2; i >= 0; i--) {
686           v    = aa + 25 * (diag[i] + 1);
687           vi   = aj + diag[i] + 1;
688           nz   = ai[i + 1] - diag[i] - 1;
689           s[0] = xb[i2];
690           s[1] = xb[i2 + 1];
691           s[2] = xb[i2 + 2];
692           s[3] = xb[i2 + 3];
693           s[4] = xb[i2 + 4];
694           while (nz--) {
695             idx   = 5 * (*vi++);
696             xw[0] = x[idx];
697             xw[1] = x[1 + idx];
698             xw[2] = x[2 + idx];
699             xw[3] = x[3 + idx];
700             xw[4] = x[4 + idx];
701             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
702             v += 25;
703           }
704           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
705           x[i2]     = xw[0];
706           x[i2 + 1] = xw[1];
707           x[i2 + 2] = xw[2];
708           x[i2 + 3] = xw[3];
709           x[i2 + 4] = xw[4];
710           idiag -= 25;
711           i2 -= 5;
712         }
713         break;
714       case 6:
715         s[0] = xb[i2];
716         s[1] = xb[i2 + 1];
717         s[2] = xb[i2 + 2];
718         s[3] = xb[i2 + 3];
719         s[4] = xb[i2 + 4];
720         s[5] = xb[i2 + 5];
721         PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
722         x[i2]     = xw[0];
723         x[i2 + 1] = xw[1];
724         x[i2 + 2] = xw[2];
725         x[i2 + 3] = xw[3];
726         x[i2 + 4] = xw[4];
727         x[i2 + 5] = xw[5];
728         i2 -= 6;
729         idiag -= 36;
730         for (i = m - 2; i >= 0; i--) {
731           v    = aa + 36 * (diag[i] + 1);
732           vi   = aj + diag[i] + 1;
733           nz   = ai[i + 1] - diag[i] - 1;
734           s[0] = xb[i2];
735           s[1] = xb[i2 + 1];
736           s[2] = xb[i2 + 2];
737           s[3] = xb[i2 + 3];
738           s[4] = xb[i2 + 4];
739           s[5] = xb[i2 + 5];
740           while (nz--) {
741             idx   = 6 * (*vi++);
742             xw[0] = x[idx];
743             xw[1] = x[1 + idx];
744             xw[2] = x[2 + idx];
745             xw[3] = x[3 + idx];
746             xw[4] = x[4 + idx];
747             xw[5] = x[5 + idx];
748             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
749             v += 36;
750           }
751           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
752           x[i2]     = xw[0];
753           x[i2 + 1] = xw[1];
754           x[i2 + 2] = xw[2];
755           x[i2 + 3] = xw[3];
756           x[i2 + 4] = xw[4];
757           x[i2 + 5] = xw[5];
758           idiag -= 36;
759           i2 -= 6;
760         }
761         break;
762       case 7:
763         s[0] = xb[i2];
764         s[1] = xb[i2 + 1];
765         s[2] = xb[i2 + 2];
766         s[3] = xb[i2 + 3];
767         s[4] = xb[i2 + 4];
768         s[5] = xb[i2 + 5];
769         s[6] = xb[i2 + 6];
770         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
771         x[i2]     = xw[0];
772         x[i2 + 1] = xw[1];
773         x[i2 + 2] = xw[2];
774         x[i2 + 3] = xw[3];
775         x[i2 + 4] = xw[4];
776         x[i2 + 5] = xw[5];
777         x[i2 + 6] = xw[6];
778         i2 -= 7;
779         idiag -= 49;
780         for (i = m - 2; i >= 0; i--) {
781           v    = aa + 49 * (diag[i] + 1);
782           vi   = aj + diag[i] + 1;
783           nz   = ai[i + 1] - diag[i] - 1;
784           s[0] = xb[i2];
785           s[1] = xb[i2 + 1];
786           s[2] = xb[i2 + 2];
787           s[3] = xb[i2 + 3];
788           s[4] = xb[i2 + 4];
789           s[5] = xb[i2 + 5];
790           s[6] = xb[i2 + 6];
791           while (nz--) {
792             idx   = 7 * (*vi++);
793             xw[0] = x[idx];
794             xw[1] = x[1 + idx];
795             xw[2] = x[2 + idx];
796             xw[3] = x[3 + idx];
797             xw[4] = x[4 + idx];
798             xw[5] = x[5 + idx];
799             xw[6] = x[6 + idx];
800             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
801             v += 49;
802           }
803           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
804           x[i2]     = xw[0];
805           x[i2 + 1] = xw[1];
806           x[i2 + 2] = xw[2];
807           x[i2 + 3] = xw[3];
808           x[i2 + 4] = xw[4];
809           x[i2 + 5] = xw[5];
810           x[i2 + 6] = xw[6];
811           idiag -= 49;
812           i2 -= 7;
813         }
814         break;
815       default:
816         PetscCall(PetscArraycpy(w, xb + i2, bs));
817         PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
818         i2 -= bs;
819         idiag -= bs2;
820         for (i = m - 2; i >= 0; i--) {
821           v  = aa + bs2 * (diag[i] + 1);
822           vi = aj + diag[i] + 1;
823           nz = ai[i + 1] - diag[i] - 1;
824 
825           PetscCall(PetscArraycpy(w, xb + i2, bs));
826           /* copy all rows of x that are needed into contiguous space */
827           workt = work;
828           for (j = 0; j < nz; j++) {
829             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
830             workt += bs;
831           }
832           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
833           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
834 
835           idiag -= bs2;
836           i2 -= bs;
837         }
838         break;
839       }
840       PetscCall(PetscLogFlops(1.0 * bs2 * (a->nz)));
841     }
842     its--;
843   }
844   while (its--) {
845     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
846       idiag = a->idiag;
847       i2    = 0;
848       switch (bs) {
849       case 1:
850         for (i = 0; i < m; i++) {
851           v    = aa + ai[i];
852           vi   = aj + ai[i];
853           nz   = ai[i + 1] - ai[i];
854           s[0] = b[i2];
855           for (j = 0; j < nz; j++) {
856             xw[0] = x[vi[j]];
857             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
858           }
859           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
860           x[i2] += xw[0];
861           idiag += 1;
862           i2 += 1;
863         }
864         break;
865       case 2:
866         for (i = 0; i < m; i++) {
867           v    = aa + 4 * ai[i];
868           vi   = aj + ai[i];
869           nz   = ai[i + 1] - ai[i];
870           s[0] = b[i2];
871           s[1] = b[i2 + 1];
872           for (j = 0; j < nz; j++) {
873             idx   = 2 * vi[j];
874             it    = 4 * j;
875             xw[0] = x[idx];
876             xw[1] = x[1 + idx];
877             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
878           }
879           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
880           x[i2] += xw[0];
881           x[i2 + 1] += xw[1];
882           idiag += 4;
883           i2 += 2;
884         }
885         break;
886       case 3:
887         for (i = 0; i < m; i++) {
888           v    = aa + 9 * ai[i];
889           vi   = aj + ai[i];
890           nz   = ai[i + 1] - ai[i];
891           s[0] = b[i2];
892           s[1] = b[i2 + 1];
893           s[2] = b[i2 + 2];
894           while (nz--) {
895             idx   = 3 * (*vi++);
896             xw[0] = x[idx];
897             xw[1] = x[1 + idx];
898             xw[2] = x[2 + idx];
899             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
900             v += 9;
901           }
902           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
903           x[i2] += xw[0];
904           x[i2 + 1] += xw[1];
905           x[i2 + 2] += xw[2];
906           idiag += 9;
907           i2 += 3;
908         }
909         break;
910       case 4:
911         for (i = 0; i < m; i++) {
912           v    = aa + 16 * ai[i];
913           vi   = aj + ai[i];
914           nz   = ai[i + 1] - ai[i];
915           s[0] = b[i2];
916           s[1] = b[i2 + 1];
917           s[2] = b[i2 + 2];
918           s[3] = b[i2 + 3];
919           while (nz--) {
920             idx   = 4 * (*vi++);
921             xw[0] = x[idx];
922             xw[1] = x[1 + idx];
923             xw[2] = x[2 + idx];
924             xw[3] = x[3 + idx];
925             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
926             v += 16;
927           }
928           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
929           x[i2] += xw[0];
930           x[i2 + 1] += xw[1];
931           x[i2 + 2] += xw[2];
932           x[i2 + 3] += xw[3];
933           idiag += 16;
934           i2 += 4;
935         }
936         break;
937       case 5:
938         for (i = 0; i < m; i++) {
939           v    = aa + 25 * ai[i];
940           vi   = aj + ai[i];
941           nz   = ai[i + 1] - ai[i];
942           s[0] = b[i2];
943           s[1] = b[i2 + 1];
944           s[2] = b[i2 + 2];
945           s[3] = b[i2 + 3];
946           s[4] = b[i2 + 4];
947           while (nz--) {
948             idx   = 5 * (*vi++);
949             xw[0] = x[idx];
950             xw[1] = x[1 + idx];
951             xw[2] = x[2 + idx];
952             xw[3] = x[3 + idx];
953             xw[4] = x[4 + idx];
954             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
955             v += 25;
956           }
957           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
958           x[i2] += xw[0];
959           x[i2 + 1] += xw[1];
960           x[i2 + 2] += xw[2];
961           x[i2 + 3] += xw[3];
962           x[i2 + 4] += xw[4];
963           idiag += 25;
964           i2 += 5;
965         }
966         break;
967       case 6:
968         for (i = 0; i < m; i++) {
969           v    = aa + 36 * ai[i];
970           vi   = aj + ai[i];
971           nz   = ai[i + 1] - ai[i];
972           s[0] = b[i2];
973           s[1] = b[i2 + 1];
974           s[2] = b[i2 + 2];
975           s[3] = b[i2 + 3];
976           s[4] = b[i2 + 4];
977           s[5] = b[i2 + 5];
978           while (nz--) {
979             idx   = 6 * (*vi++);
980             xw[0] = x[idx];
981             xw[1] = x[1 + idx];
982             xw[2] = x[2 + idx];
983             xw[3] = x[3 + idx];
984             xw[4] = x[4 + idx];
985             xw[5] = x[5 + idx];
986             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
987             v += 36;
988           }
989           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
990           x[i2] += xw[0];
991           x[i2 + 1] += xw[1];
992           x[i2 + 2] += xw[2];
993           x[i2 + 3] += xw[3];
994           x[i2 + 4] += xw[4];
995           x[i2 + 5] += xw[5];
996           idiag += 36;
997           i2 += 6;
998         }
999         break;
1000       case 7:
1001         for (i = 0; i < m; i++) {
1002           v    = aa + 49 * ai[i];
1003           vi   = aj + ai[i];
1004           nz   = ai[i + 1] - ai[i];
1005           s[0] = b[i2];
1006           s[1] = b[i2 + 1];
1007           s[2] = b[i2 + 2];
1008           s[3] = b[i2 + 3];
1009           s[4] = b[i2 + 4];
1010           s[5] = b[i2 + 5];
1011           s[6] = b[i2 + 6];
1012           while (nz--) {
1013             idx   = 7 * (*vi++);
1014             xw[0] = x[idx];
1015             xw[1] = x[1 + idx];
1016             xw[2] = x[2 + idx];
1017             xw[3] = x[3 + idx];
1018             xw[4] = x[4 + idx];
1019             xw[5] = x[5 + idx];
1020             xw[6] = x[6 + idx];
1021             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1022             v += 49;
1023           }
1024           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1025           x[i2] += xw[0];
1026           x[i2 + 1] += xw[1];
1027           x[i2 + 2] += xw[2];
1028           x[i2 + 3] += xw[3];
1029           x[i2 + 4] += xw[4];
1030           x[i2 + 5] += xw[5];
1031           x[i2 + 6] += xw[6];
1032           idiag += 49;
1033           i2 += 7;
1034         }
1035         break;
1036       default:
1037         for (i = 0; i < m; i++) {
1038           v  = aa + bs2 * ai[i];
1039           vi = aj + ai[i];
1040           nz = ai[i + 1] - ai[i];
1041 
1042           PetscCall(PetscArraycpy(w, b + i2, bs));
1043           /* copy all rows of x that are needed into contiguous space */
1044           workt = work;
1045           for (j = 0; j < nz; j++) {
1046             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1047             workt += bs;
1048           }
1049           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1050           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1051 
1052           idiag += bs2;
1053           i2 += bs;
1054         }
1055         break;
1056       }
1057       PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1058     }
1059     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1060       idiag = a->idiag + bs2 * (a->mbs - 1);
1061       i2    = bs * (m - 1);
1062       switch (bs) {
1063       case 1:
1064         for (i = m - 1; i >= 0; i--) {
1065           v    = aa + ai[i];
1066           vi   = aj + ai[i];
1067           nz   = ai[i + 1] - ai[i];
1068           s[0] = b[i2];
1069           for (j = 0; j < nz; j++) {
1070             xw[0] = x[vi[j]];
1071             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1072           }
1073           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1074           x[i2] += xw[0];
1075           idiag -= 1;
1076           i2 -= 1;
1077         }
1078         break;
1079       case 2:
1080         for (i = m - 1; i >= 0; i--) {
1081           v    = aa + 4 * ai[i];
1082           vi   = aj + ai[i];
1083           nz   = ai[i + 1] - ai[i];
1084           s[0] = b[i2];
1085           s[1] = b[i2 + 1];
1086           for (j = 0; j < nz; j++) {
1087             idx   = 2 * vi[j];
1088             it    = 4 * j;
1089             xw[0] = x[idx];
1090             xw[1] = x[1 + idx];
1091             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1092           }
1093           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1094           x[i2] += xw[0];
1095           x[i2 + 1] += xw[1];
1096           idiag -= 4;
1097           i2 -= 2;
1098         }
1099         break;
1100       case 3:
1101         for (i = m - 1; i >= 0; i--) {
1102           v    = aa + 9 * ai[i];
1103           vi   = aj + ai[i];
1104           nz   = ai[i + 1] - ai[i];
1105           s[0] = b[i2];
1106           s[1] = b[i2 + 1];
1107           s[2] = b[i2 + 2];
1108           while (nz--) {
1109             idx   = 3 * (*vi++);
1110             xw[0] = x[idx];
1111             xw[1] = x[1 + idx];
1112             xw[2] = x[2 + idx];
1113             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1114             v += 9;
1115           }
1116           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1117           x[i2] += xw[0];
1118           x[i2 + 1] += xw[1];
1119           x[i2 + 2] += xw[2];
1120           idiag -= 9;
1121           i2 -= 3;
1122         }
1123         break;
1124       case 4:
1125         for (i = m - 1; i >= 0; i--) {
1126           v    = aa + 16 * ai[i];
1127           vi   = aj + ai[i];
1128           nz   = ai[i + 1] - ai[i];
1129           s[0] = b[i2];
1130           s[1] = b[i2 + 1];
1131           s[2] = b[i2 + 2];
1132           s[3] = b[i2 + 3];
1133           while (nz--) {
1134             idx   = 4 * (*vi++);
1135             xw[0] = x[idx];
1136             xw[1] = x[1 + idx];
1137             xw[2] = x[2 + idx];
1138             xw[3] = x[3 + idx];
1139             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1140             v += 16;
1141           }
1142           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1143           x[i2] += xw[0];
1144           x[i2 + 1] += xw[1];
1145           x[i2 + 2] += xw[2];
1146           x[i2 + 3] += xw[3];
1147           idiag -= 16;
1148           i2 -= 4;
1149         }
1150         break;
1151       case 5:
1152         for (i = m - 1; i >= 0; i--) {
1153           v    = aa + 25 * ai[i];
1154           vi   = aj + ai[i];
1155           nz   = ai[i + 1] - ai[i];
1156           s[0] = b[i2];
1157           s[1] = b[i2 + 1];
1158           s[2] = b[i2 + 2];
1159           s[3] = b[i2 + 3];
1160           s[4] = b[i2 + 4];
1161           while (nz--) {
1162             idx   = 5 * (*vi++);
1163             xw[0] = x[idx];
1164             xw[1] = x[1 + idx];
1165             xw[2] = x[2 + idx];
1166             xw[3] = x[3 + idx];
1167             xw[4] = x[4 + idx];
1168             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1169             v += 25;
1170           }
1171           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1172           x[i2] += xw[0];
1173           x[i2 + 1] += xw[1];
1174           x[i2 + 2] += xw[2];
1175           x[i2 + 3] += xw[3];
1176           x[i2 + 4] += xw[4];
1177           idiag -= 25;
1178           i2 -= 5;
1179         }
1180         break;
1181       case 6:
1182         for (i = m - 1; i >= 0; i--) {
1183           v    = aa + 36 * ai[i];
1184           vi   = aj + ai[i];
1185           nz   = ai[i + 1] - ai[i];
1186           s[0] = b[i2];
1187           s[1] = b[i2 + 1];
1188           s[2] = b[i2 + 2];
1189           s[3] = b[i2 + 3];
1190           s[4] = b[i2 + 4];
1191           s[5] = b[i2 + 5];
1192           while (nz--) {
1193             idx   = 6 * (*vi++);
1194             xw[0] = x[idx];
1195             xw[1] = x[1 + idx];
1196             xw[2] = x[2 + idx];
1197             xw[3] = x[3 + idx];
1198             xw[4] = x[4 + idx];
1199             xw[5] = x[5 + idx];
1200             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1201             v += 36;
1202           }
1203           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1204           x[i2] += xw[0];
1205           x[i2 + 1] += xw[1];
1206           x[i2 + 2] += xw[2];
1207           x[i2 + 3] += xw[3];
1208           x[i2 + 4] += xw[4];
1209           x[i2 + 5] += xw[5];
1210           idiag -= 36;
1211           i2 -= 6;
1212         }
1213         break;
1214       case 7:
1215         for (i = m - 1; i >= 0; i--) {
1216           v    = aa + 49 * ai[i];
1217           vi   = aj + ai[i];
1218           nz   = ai[i + 1] - ai[i];
1219           s[0] = b[i2];
1220           s[1] = b[i2 + 1];
1221           s[2] = b[i2 + 2];
1222           s[3] = b[i2 + 3];
1223           s[4] = b[i2 + 4];
1224           s[5] = b[i2 + 5];
1225           s[6] = b[i2 + 6];
1226           while (nz--) {
1227             idx   = 7 * (*vi++);
1228             xw[0] = x[idx];
1229             xw[1] = x[1 + idx];
1230             xw[2] = x[2 + idx];
1231             xw[3] = x[3 + idx];
1232             xw[4] = x[4 + idx];
1233             xw[5] = x[5 + idx];
1234             xw[6] = x[6 + idx];
1235             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1236             v += 49;
1237           }
1238           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1239           x[i2] += xw[0];
1240           x[i2 + 1] += xw[1];
1241           x[i2 + 2] += xw[2];
1242           x[i2 + 3] += xw[3];
1243           x[i2 + 4] += xw[4];
1244           x[i2 + 5] += xw[5];
1245           x[i2 + 6] += xw[6];
1246           idiag -= 49;
1247           i2 -= 7;
1248         }
1249         break;
1250       default:
1251         for (i = m - 1; i >= 0; i--) {
1252           v  = aa + bs2 * ai[i];
1253           vi = aj + ai[i];
1254           nz = ai[i + 1] - ai[i];
1255 
1256           PetscCall(PetscArraycpy(w, b + i2, bs));
1257           /* copy all rows of x that are needed into contiguous space */
1258           workt = work;
1259           for (j = 0; j < nz; j++) {
1260             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1261             workt += bs;
1262           }
1263           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1264           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1265 
1266           idiag -= bs2;
1267           i2 -= bs;
1268         }
1269         break;
1270       }
1271       PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz)));
1272     }
1273   }
1274   PetscCall(VecRestoreArray(xx, &x));
1275   PetscCall(VecRestoreArrayRead(bb, &b));
1276   PetscFunctionReturn(PETSC_SUCCESS);
1277 }
1278 
1279 /*
1280     Special version for direct calls from Fortran (Used in PETSc-fun3d)
1281 */
1282 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1283   #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1284 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1285   #define matsetvaluesblocked4_ matsetvaluesblocked4
1286 #endif
1287 
matsetvaluesblocked4_(Mat * AA,PetscInt * mm,const PetscInt im[],PetscInt * nn,const PetscInt in[],const PetscScalar v[])1288 PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[])
1289 {
1290   Mat                A = *AA;
1291   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1292   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1293   PetscInt          *ai = a->i, *ailen = a->ilen;
1294   PetscInt          *aj = a->j, stepval, lastcol = -1;
1295   const PetscScalar *value = v;
1296   MatScalar         *ap, *aa = a->a, *bap;
1297 
1298   PetscFunctionBegin;
1299   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1300   stepval = (n - 1) * 4;
1301   for (k = 0; k < m; k++) { /* loop over added rows */
1302     row  = im[k];
1303     rp   = aj + ai[row];
1304     ap   = aa + 16 * ai[row];
1305     nrow = ailen[row];
1306     low  = 0;
1307     high = nrow;
1308     for (l = 0; l < n; l++) { /* loop over added columns */
1309       col = in[l];
1310       if (col <= lastcol) low = 0;
1311       else high = nrow;
1312       lastcol = col;
1313       value   = v + k * (stepval + 4 + l) * 4;
1314       while (high - low > 7) {
1315         t = (low + high) / 2;
1316         if (rp[t] > col) high = t;
1317         else low = t;
1318       }
1319       for (i = low; i < high; i++) {
1320         if (rp[i] > col) break;
1321         if (rp[i] == col) {
1322           bap = ap + 16 * i;
1323           for (ii = 0; ii < 4; ii++, value += stepval) {
1324             for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1325           }
1326           goto noinsert2;
1327         }
1328       }
1329       N = nrow++ - 1;
1330       high++; /* added new column index thus must search to one higher than before */
1331       /* shift up all the later entries in this row */
1332       for (ii = N; ii >= i; ii--) {
1333         rp[ii + 1] = rp[ii];
1334         PetscCallVoid(PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16));
1335       }
1336       if (N >= i) PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1337       rp[i] = col;
1338       bap   = ap + 16 * i;
1339       for (ii = 0; ii < 4; ii++, value += stepval) {
1340         for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1341       }
1342     noinsert2:;
1343       low = i;
1344     }
1345     ailen[row] = nrow;
1346   }
1347   PetscFunctionReturnVoid();
1348 }
1349 
1350 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1351   #define matsetvalues4_ MATSETVALUES4
1352 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1353   #define matsetvalues4_ matsetvalues4
1354 #endif
1355 
matsetvalues4_(Mat * AA,PetscInt * mm,PetscInt * im,PetscInt * nn,PetscInt * in,PetscScalar * v)1356 PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v)
1357 {
1358   Mat          A = *AA;
1359   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1360   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1361   PetscInt    *ai = a->i, *ailen = a->ilen;
1362   PetscInt    *aj = a->j, brow, bcol;
1363   PetscInt     ridx, cidx, lastcol = -1;
1364   MatScalar   *ap, value, *aa      = a->a, *bap;
1365 
1366   PetscFunctionBegin;
1367   for (k = 0; k < m; k++) { /* loop over added rows */
1368     row  = im[k];
1369     brow = row / 4;
1370     rp   = aj + ai[brow];
1371     ap   = aa + 16 * ai[brow];
1372     nrow = ailen[brow];
1373     low  = 0;
1374     high = nrow;
1375     for (l = 0; l < n; l++) { /* loop over added columns */
1376       col   = in[l];
1377       bcol  = col / 4;
1378       ridx  = row % 4;
1379       cidx  = col % 4;
1380       value = v[l + k * n];
1381       if (col <= lastcol) low = 0;
1382       else high = nrow;
1383       lastcol = col;
1384       while (high - low > 7) {
1385         t = (low + high) / 2;
1386         if (rp[t] > bcol) high = t;
1387         else low = t;
1388       }
1389       for (i = low; i < high; i++) {
1390         if (rp[i] > bcol) break;
1391         if (rp[i] == bcol) {
1392           bap = ap + 16 * i + 4 * cidx + ridx;
1393           *bap += value;
1394           goto noinsert1;
1395         }
1396       }
1397       N = nrow++ - 1;
1398       high++; /* added new column thus must search to one higher than before */
1399       /* shift up all the later entries in this row */
1400       PetscCallVoid(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
1401       PetscCallVoid(PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1)));
1402       PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1403       rp[i]                        = bcol;
1404       ap[16 * i + 4 * cidx + ridx] = value;
1405     noinsert1:;
1406       low = i;
1407     }
1408     ailen[brow] = nrow;
1409   }
1410   PetscFunctionReturnVoid();
1411 }
1412 
MatGetRowIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * nn,const PetscInt * inia[],const PetscInt * inja[],PetscBool * done)1413 static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done)
1414 {
1415   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1416   PetscInt     i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1417   PetscInt   **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;
1418 
1419   PetscFunctionBegin;
1420   *nn = n;
1421   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
1422   if (symmetric) {
1423     PetscCall(MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja));
1424     nz = tia[n];
1425   } else {
1426     tia = a->i;
1427     tja = a->j;
1428   }
1429 
1430   if (!blockcompressed && bs > 1) {
1431     (*nn) *= bs;
1432     /* malloc & create the natural set of indices */
1433     PetscCall(PetscMalloc1((n + 1) * bs, ia));
1434     if (n) {
1435       (*ia)[0] = oshift;
1436       for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1437     }
1438 
1439     for (i = 1; i < n; i++) {
1440       (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1441       for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1442     }
1443     if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];
1444 
1445     if (inja) {
1446       PetscCall(PetscMalloc1(nz * bs * bs, ja));
1447       cnt = 0;
1448       for (i = 0; i < n; i++) {
1449         for (j = 0; j < bs; j++) {
1450           for (k = tia[i]; k < tia[i + 1]; k++) {
1451             for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1452           }
1453         }
1454       }
1455     }
1456 
1457     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1458       PetscCall(PetscFree(tia));
1459       PetscCall(PetscFree(tja));
1460     }
1461   } else if (oshift == 1) {
1462     if (symmetric) {
1463       nz = tia[A->rmap->n / bs];
1464       /*  add 1 to i and j indices */
1465       for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1466       *ia = tia;
1467       if (ja) {
1468         for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1469         *ja = tja;
1470       }
1471     } else {
1472       nz = a->i[A->rmap->n / bs];
1473       /* malloc space and  add 1 to i and j indices */
1474       PetscCall(PetscMalloc1(A->rmap->n / bs + 1, ia));
1475       for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1476       if (ja) {
1477         PetscCall(PetscMalloc1(nz, ja));
1478         for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1479       }
1480     }
1481   } else {
1482     *ia = tia;
1483     if (ja) *ja = tja;
1484   }
1485   PetscFunctionReturn(PETSC_SUCCESS);
1486 }
1487 
MatRestoreRowIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * nn,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)1488 static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
1489 {
1490   PetscFunctionBegin;
1491   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
1492   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1493     PetscCall(PetscFree(*ia));
1494     if (ja) PetscCall(PetscFree(*ja));
1495   }
1496   PetscFunctionReturn(PETSC_SUCCESS);
1497 }
1498 
MatDestroy_SeqBAIJ(Mat A)1499 PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1500 {
1501   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1502 
1503   PetscFunctionBegin;
1504   if (A->hash_active) {
1505     PetscInt bs;
1506     A->ops[0] = a->cops;
1507     PetscCall(PetscHMapIJVDestroy(&a->ht));
1508     PetscCall(MatGetBlockSize(A, &bs));
1509     if (bs > 1) PetscCall(PetscHSetIJDestroy(&a->bht));
1510     PetscCall(PetscFree(a->dnz));
1511     PetscCall(PetscFree(a->bdnz));
1512     A->hash_active = PETSC_FALSE;
1513   }
1514   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz));
1515   PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
1516   PetscCall(ISDestroy(&a->row));
1517   PetscCall(ISDestroy(&a->col));
1518   PetscCall(PetscFree(a->diag));
1519   PetscCall(PetscFree(a->idiag));
1520   if (a->free_imax_ilen) PetscCall(PetscFree2(a->imax, a->ilen));
1521   PetscCall(PetscFree(a->solve_work));
1522   PetscCall(PetscFree(a->mult_work));
1523   PetscCall(PetscFree(a->sor_workt));
1524   PetscCall(PetscFree(a->sor_work));
1525   PetscCall(ISDestroy(&a->icol));
1526   PetscCall(PetscFree(a->saved_values));
1527   PetscCall(PetscFree2(a->compressedrow.i, a->compressedrow.rindex));
1528 
1529   PetscCall(MatDestroy(&a->sbaijMat));
1530   PetscCall(MatDestroy(&a->parent));
1531   PetscCall(PetscFree(A->data));
1532 
1533   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
1534   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL));
1535   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL));
1536   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
1537   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
1538   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL));
1539   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL));
1540   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL));
1541   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL));
1542   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL));
1543   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL));
1544   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL));
1545 #if defined(PETSC_HAVE_HYPRE)
1546   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL));
1547 #endif
1548   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL));
1549   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 
MatSetOption_SeqBAIJ(Mat A,MatOption op,PetscBool flg)1553 static PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg)
1554 {
1555   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1556 
1557   PetscFunctionBegin;
1558   switch (op) {
1559   case MAT_ROW_ORIENTED:
1560     a->roworiented = flg;
1561     break;
1562   case MAT_KEEP_NONZERO_PATTERN:
1563     a->keepnonzeropattern = flg;
1564     break;
1565   case MAT_NEW_NONZERO_LOCATIONS:
1566     a->nonew = (flg ? 0 : 1);
1567     break;
1568   case MAT_NEW_NONZERO_LOCATION_ERR:
1569     a->nonew = (flg ? -1 : 0);
1570     break;
1571   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1572     a->nonew = (flg ? -2 : 0);
1573     break;
1574   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1575     a->nounused = (flg ? -1 : 0);
1576     break;
1577   default:
1578     break;
1579   }
1580   PetscFunctionReturn(PETSC_SUCCESS);
1581 }
1582 
1583 /* used for both SeqBAIJ and SeqSBAIJ matrices */
MatGetRow_SeqBAIJ_private(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v,PetscInt * ai,PetscInt * aj,PetscScalar * aa)1584 PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa)
1585 {
1586   PetscInt     itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1587   MatScalar   *aa_i;
1588   PetscScalar *v_i;
1589 
1590   PetscFunctionBegin;
1591   bs  = A->rmap->bs;
1592   bs2 = bs * bs;
1593   PetscCheck(row >= 0 && row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
1594 
1595   bn  = row / bs; /* Block number */
1596   bp  = row % bs; /* Block Position */
1597   M   = ai[bn + 1] - ai[bn];
1598   *nz = bs * M;
1599 
1600   if (v) {
1601     *v = NULL;
1602     if (*nz) {
1603       PetscCall(PetscMalloc1(*nz, v));
1604       for (i = 0; i < M; i++) { /* for each block in the block row */
1605         v_i  = *v + i * bs;
1606         aa_i = aa + bs2 * (ai[bn] + i);
1607         for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1608       }
1609     }
1610   }
1611 
1612   if (idx) {
1613     *idx = NULL;
1614     if (*nz) {
1615       PetscCall(PetscMalloc1(*nz, idx));
1616       for (i = 0; i < M; i++) { /* for each block in the block row */
1617         idx_i = *idx + i * bs;
1618         itmp  = bs * aj[ai[bn] + i];
1619         for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1620       }
1621     }
1622   }
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
MatGetRow_SeqBAIJ(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)1626 PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1627 {
1628   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1629 
1630   PetscFunctionBegin;
1631   PetscCall(MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a));
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
MatRestoreRow_SeqBAIJ(Mat A,PetscInt row,PetscInt * nz,PetscInt ** idx,PetscScalar ** v)1635 PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1636 {
1637   PetscFunctionBegin;
1638   if (idx) PetscCall(PetscFree(*idx));
1639   if (v) PetscCall(PetscFree(*v));
1640   PetscFunctionReturn(PETSC_SUCCESS);
1641 }
1642 
MatTranspose_SeqBAIJ(Mat A,MatReuse reuse,Mat * B)1643 static PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B)
1644 {
1645   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1646   Mat          C;
1647   PetscInt     i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1648   PetscInt     bs2 = a->bs2, *ati, *atj, anzj, kr;
1649   MatScalar   *ata, *aa = a->a;
1650 
1651   PetscFunctionBegin;
1652   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
1653   PetscCall(PetscCalloc1(1 + nbs, &atfill));
1654   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1655     for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */
1656 
1657     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1658     PetscCall(MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N));
1659     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1660     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, atfill));
1661 
1662     at  = (Mat_SeqBAIJ *)C->data;
1663     ati = at->i;
1664     for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1665   } else {
1666     C   = *B;
1667     at  = (Mat_SeqBAIJ *)C->data;
1668     ati = at->i;
1669   }
1670 
1671   atj = at->j;
1672   ata = at->a;
1673 
1674   /* Copy ati into atfill so we have locations of the next free space in atj */
1675   PetscCall(PetscArraycpy(atfill, ati, nbs));
1676 
1677   /* Walk through A row-wise and mark nonzero entries of A^T. */
1678   for (i = 0; i < mbs; i++) {
1679     anzj = ai[i + 1] - ai[i];
1680     for (j = 0; j < anzj; j++) {
1681       atj[atfill[*aj]] = i;
1682       for (kr = 0; kr < bs; kr++) {
1683         for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1684       }
1685       atfill[*aj++] += 1;
1686     }
1687   }
1688   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1689   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1690 
1691   /* Clean up temporary space and complete requests. */
1692   PetscCall(PetscFree(atfill));
1693 
1694   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1695     PetscCall(MatSetBlockSizes(C, A->cmap->bs, A->rmap->bs));
1696     *B = C;
1697   } else {
1698     PetscCall(MatHeaderMerge(A, &C));
1699   }
1700   PetscFunctionReturn(PETSC_SUCCESS);
1701 }
1702 
MatCompare_SeqBAIJ_Private(Mat A,Mat B,PetscReal tol,PetscBool * flg)1703 static PetscErrorCode MatCompare_SeqBAIJ_Private(Mat A, Mat B, PetscReal tol, PetscBool *flg)
1704 {
1705   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1706 
1707   PetscFunctionBegin;
1708   /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1709   if (A->rmap->N != B->rmap->N || A->cmap->n != B->cmap->n || A->rmap->bs != B->rmap->bs || a->nz != b->nz) {
1710     *flg = PETSC_FALSE;
1711     PetscFunctionReturn(PETSC_SUCCESS);
1712   }
1713 
1714   /* if the a->i are the same */
1715   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1716   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1717 
1718   /* if a->j are the same */
1719   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1720   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1721 
1722   if (tol == 0.0) PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg)); /* if a->a are the same */
1723   else {
1724     *flg = PETSC_TRUE;
1725     for (PetscInt i = 0; (i < a->nz * A->rmap->bs * A->rmap->bs) && *flg; ++i)
1726       if (PetscAbsScalar(a->a[i] - b->a[i]) > tol) *flg = PETSC_FALSE;
1727   }
1728   PetscFunctionReturn(PETSC_SUCCESS);
1729 }
1730 
MatIsTranspose_SeqBAIJ(Mat A,Mat B,PetscReal tol,PetscBool * f)1731 static PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f)
1732 {
1733   Mat Btrans;
1734 
1735   PetscFunctionBegin;
1736   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans));
1737   PetscCall(MatCompare_SeqBAIJ_Private(A, Btrans, tol, f));
1738   PetscCall(MatDestroy(&Btrans));
1739   PetscFunctionReturn(PETSC_SUCCESS);
1740 }
1741 
MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)1742 static PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
1743 {
1744   PetscFunctionBegin;
1745   PetscCall(MatCompare_SeqBAIJ_Private(A, B, 0.0, flg));
1746   PetscFunctionReturn(PETSC_SUCCESS);
1747 }
1748 
1749 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
MatView_SeqBAIJ_Binary(Mat mat,PetscViewer viewer)1750 PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
1751 {
1752   Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1753   PetscInt     header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1754   PetscInt    *rowlens, *colidxs;
1755   PetscScalar *matvals;
1756 
1757   PetscFunctionBegin;
1758   PetscCall(PetscViewerSetUp(viewer));
1759 
1760   M  = mat->rmap->N;
1761   N  = mat->cmap->N;
1762   m  = mat->rmap->n;
1763   bs = mat->rmap->bs;
1764   nz = bs * bs * A->nz;
1765 
1766   /* write matrix header */
1767   header[0] = MAT_FILE_CLASSID;
1768   header[1] = M;
1769   header[2] = N;
1770   header[3] = nz;
1771   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1772 
1773   /* store row lengths */
1774   PetscCall(PetscMalloc1(m, &rowlens));
1775   for (cnt = 0, i = 0; i < A->mbs; i++)
1776     for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1777   PetscCall(PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT));
1778   PetscCall(PetscFree(rowlens));
1779 
1780   /* store column indices  */
1781   PetscCall(PetscMalloc1(nz, &colidxs));
1782   for (cnt = 0, i = 0; i < A->mbs; i++)
1783     for (k = 0; k < bs; k++)
1784       for (j = A->i[i]; j < A->i[i + 1]; j++)
1785         for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1786   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1787   PetscCall(PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT));
1788   PetscCall(PetscFree(colidxs));
1789 
1790   /* store nonzero values */
1791   PetscCall(PetscMalloc1(nz, &matvals));
1792   for (cnt = 0, i = 0; i < A->mbs; i++)
1793     for (k = 0; k < bs; k++)
1794       for (j = A->i[i]; j < A->i[i + 1]; j++)
1795         for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1796   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1797   PetscCall(PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR));
1798   PetscCall(PetscFree(matvals));
1799 
1800   /* write block size option to the viewer's .info file */
1801   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1802   PetscFunctionReturn(PETSC_SUCCESS);
1803 }
1804 
MatView_SeqBAIJ_ASCII_structonly(Mat A,PetscViewer viewer)1805 static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer)
1806 {
1807   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1808   PetscInt     i, bs = A->rmap->bs, k;
1809 
1810   PetscFunctionBegin;
1811   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1812   for (i = 0; i < a->mbs; i++) {
1813     PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1));
1814     for (k = a->i[i]; k < a->i[i + 1]; k++) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1));
1815     PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1816   }
1817   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1818   PetscFunctionReturn(PETSC_SUCCESS);
1819 }
1820 
MatView_SeqBAIJ_ASCII(Mat A,PetscViewer viewer)1821 static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer)
1822 {
1823   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1824   PetscInt          i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1825   PetscViewerFormat format;
1826 
1827   PetscFunctionBegin;
1828   if (A->structure_only) {
1829     PetscCall(MatView_SeqBAIJ_ASCII_structonly(A, viewer));
1830     PetscFunctionReturn(PETSC_SUCCESS);
1831   }
1832 
1833   PetscCall(PetscViewerGetFormat(viewer, &format));
1834   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1835   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1836     const char *matname;
1837     Mat         aij;
1838     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij));
1839     PetscCall(PetscObjectGetName((PetscObject)A, &matname));
1840     PetscCall(PetscObjectSetName((PetscObject)aij, matname));
1841     PetscCall(MatView(aij, viewer));
1842     PetscCall(MatDestroy(&aij));
1843   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1844     PetscFunctionReturn(PETSC_SUCCESS);
1845   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1846     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1847     for (i = 0; i < a->mbs; i++) {
1848       for (j = 0; j < bs; j++) {
1849         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1850         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1851           for (l = 0; l < bs; l++) {
1852 #if defined(PETSC_USE_COMPLEX)
1853             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1854               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1855             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1856               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1857             } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1858               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1859             }
1860 #else
1861             if (a->a[bs2 * k + l * bs + j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1862 #endif
1863           }
1864         }
1865         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1866       }
1867     }
1868     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1869   } else {
1870     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1871     for (i = 0; i < a->mbs; i++) {
1872       for (j = 0; j < bs; j++) {
1873         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1874         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1875           for (l = 0; l < bs; l++) {
1876 #if defined(PETSC_USE_COMPLEX)
1877             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1878               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1879             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1880               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1881             } else {
1882               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1883             }
1884 #else
1885             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1886 #endif
1887           }
1888         }
1889         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1890       }
1891     }
1892     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1893   }
1894   PetscCall(PetscViewerFlush(viewer));
1895   PetscFunctionReturn(PETSC_SUCCESS);
1896 }
1897 
1898 #include <petscdraw.h>
MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw,void * Aa)1899 static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa)
1900 {
1901   Mat               A = (Mat)Aa;
1902   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1903   PetscInt          row, i, j, k, l, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
1904   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1905   MatScalar        *aa;
1906   PetscViewer       viewer;
1907   PetscViewerFormat format;
1908   int               color;
1909 
1910   PetscFunctionBegin;
1911   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1912   PetscCall(PetscViewerGetFormat(viewer, &format));
1913   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1914 
1915   /* loop over matrix elements drawing boxes */
1916 
1917   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1918     PetscDrawCollectiveBegin(draw);
1919     /* Blue for negative, Cyan for zero and  Red for positive */
1920     color = PETSC_DRAW_BLUE;
1921     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1922       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1923         y_l = A->rmap->N - row - 1.0;
1924         y_r = y_l + 1.0;
1925         x_l = a->j[j] * bs;
1926         x_r = x_l + 1.0;
1927         aa  = a->a + j * bs2;
1928         for (k = 0; k < bs; k++) {
1929           for (l = 0; l < bs; l++) {
1930             if (PetscRealPart(*aa++) >= 0.) continue;
1931             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1932           }
1933         }
1934       }
1935     }
1936     color = PETSC_DRAW_CYAN;
1937     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1938       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1939         y_l = A->rmap->N - row - 1.0;
1940         y_r = y_l + 1.0;
1941         x_l = a->j[j] * bs;
1942         x_r = x_l + 1.0;
1943         aa  = a->a + j * bs2;
1944         for (k = 0; k < bs; k++) {
1945           for (l = 0; l < bs; l++) {
1946             if (PetscRealPart(*aa++) != 0.) continue;
1947             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1948           }
1949         }
1950       }
1951     }
1952     color = PETSC_DRAW_RED;
1953     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1954       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1955         y_l = A->rmap->N - row - 1.0;
1956         y_r = y_l + 1.0;
1957         x_l = a->j[j] * bs;
1958         x_r = x_l + 1.0;
1959         aa  = a->a + j * bs2;
1960         for (k = 0; k < bs; k++) {
1961           for (l = 0; l < bs; l++) {
1962             if (PetscRealPart(*aa++) <= 0.) continue;
1963             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1964           }
1965         }
1966       }
1967     }
1968     PetscDrawCollectiveEnd(draw);
1969   } else {
1970     /* use contour shading to indicate magnitude of values */
1971     /* first determine max of all nonzero values */
1972     PetscReal minv = 0.0, maxv = 0.0;
1973     PetscDraw popup;
1974 
1975     for (i = 0; i < a->nz * a->bs2; i++) {
1976       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1977     }
1978     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1979     PetscCall(PetscDrawGetPopup(draw, &popup));
1980     PetscCall(PetscDrawScalePopup(popup, 0.0, maxv));
1981 
1982     PetscDrawCollectiveBegin(draw);
1983     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1984       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1985         y_l = A->rmap->N - row - 1.0;
1986         y_r = y_l + 1.0;
1987         x_l = a->j[j] * bs;
1988         x_r = x_l + 1.0;
1989         aa  = a->a + j * bs2;
1990         for (k = 0; k < bs; k++) {
1991           for (l = 0; l < bs; l++) {
1992             MatScalar v = *aa++;
1993             color       = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
1994             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1995           }
1996         }
1997       }
1998     }
1999     PetscDrawCollectiveEnd(draw);
2000   }
2001   PetscFunctionReturn(PETSC_SUCCESS);
2002 }
2003 
MatView_SeqBAIJ_Draw(Mat A,PetscViewer viewer)2004 static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer)
2005 {
2006   PetscReal xl, yl, xr, yr, w, h;
2007   PetscDraw draw;
2008   PetscBool isnull;
2009 
2010   PetscFunctionBegin;
2011   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
2012   PetscCall(PetscDrawIsNull(draw, &isnull));
2013   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
2014 
2015   xr = A->cmap->n;
2016   yr = A->rmap->N;
2017   h  = yr / 10.0;
2018   w  = xr / 10.0;
2019   xr += w;
2020   yr += h;
2021   xl = -w;
2022   yl = -h;
2023   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
2024   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
2025   PetscCall(PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A));
2026   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
2027   PetscCall(PetscDrawSave(draw));
2028   PetscFunctionReturn(PETSC_SUCCESS);
2029 }
2030 
MatView_SeqBAIJ(Mat A,PetscViewer viewer)2031 PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer)
2032 {
2033   PetscBool isascii, isbinary, isdraw;
2034 
2035   PetscFunctionBegin;
2036   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
2037   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
2038   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
2039   if (isascii) {
2040     PetscCall(MatView_SeqBAIJ_ASCII(A, viewer));
2041   } else if (isbinary) {
2042     PetscCall(MatView_SeqBAIJ_Binary(A, viewer));
2043   } else if (isdraw) {
2044     PetscCall(MatView_SeqBAIJ_Draw(A, viewer));
2045   } else {
2046     Mat B;
2047     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
2048     PetscCall(MatView(B, viewer));
2049     PetscCall(MatDestroy(&B));
2050   }
2051   PetscFunctionReturn(PETSC_SUCCESS);
2052 }
2053 
MatGetValues_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])2054 PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
2055 {
2056   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2057   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2058   PetscInt    *ai = a->i, *ailen = a->ilen;
2059   PetscInt     brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2060   MatScalar   *ap, *aa = a->a;
2061 
2062   PetscFunctionBegin;
2063   for (k = 0; k < m; k++) { /* loop over rows */
2064     row  = im[k];
2065     brow = row / bs;
2066     if (row < 0) {
2067       v += n;
2068       continue;
2069     } /* negative row */
2070     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " too large", row);
2071     rp   = PetscSafePointerPlusOffset(aj, ai[brow]);
2072     ap   = PetscSafePointerPlusOffset(aa, bs2 * ai[brow]);
2073     nrow = ailen[brow];
2074     for (l = 0; l < n; l++) { /* loop over columns */
2075       if (in[l] < 0) {
2076         v++;
2077         continue;
2078       } /* negative column */
2079       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column %" PetscInt_FMT " too large", in[l]);
2080       col  = in[l];
2081       bcol = col / bs;
2082       cidx = col % bs;
2083       ridx = row % bs;
2084       high = nrow;
2085       low  = 0; /* assume unsorted */
2086       while (high - low > 5) {
2087         t = (low + high) / 2;
2088         if (rp[t] > bcol) high = t;
2089         else low = t;
2090       }
2091       for (i = low; i < high; i++) {
2092         if (rp[i] > bcol) break;
2093         if (rp[i] == bcol) {
2094           *v++ = ap[bs2 * i + bs * cidx + ridx];
2095           goto finished;
2096         }
2097       }
2098       *v++ = 0.0;
2099     finished:;
2100     }
2101   }
2102   PetscFunctionReturn(PETSC_SUCCESS);
2103 }
2104 
MatSetValuesBlocked_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)2105 PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2106 {
2107   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2108   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2109   PetscInt          *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2110   PetscInt          *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2111   PetscBool          roworiented = a->roworiented;
2112   const PetscScalar *value       = v;
2113   MatScalar         *ap = NULL, *aa = a->a, *bap;
2114 
2115   PetscFunctionBegin;
2116   if (roworiented) {
2117     stepval = (n - 1) * bs;
2118   } else {
2119     stepval = (m - 1) * bs;
2120   }
2121   for (k = 0; k < m; k++) { /* loop over added rows */
2122     row = im[k];
2123     if (row < 0) continue;
2124     PetscCheck(row < a->mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block row index too large %" PetscInt_FMT " max %" PetscInt_FMT, row, a->mbs - 1);
2125     rp = aj + ai[row];
2126     if (!A->structure_only) ap = aa + bs2 * ai[row];
2127     rmax = imax[row];
2128     nrow = ailen[row];
2129     low  = 0;
2130     high = nrow;
2131     for (l = 0; l < n; l++) { /* loop over added columns */
2132       if (in[l] < 0) continue;
2133       PetscCheck(in[l] < a->nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block column index too large %" PetscInt_FMT " max %" PetscInt_FMT, in[l], a->nbs - 1);
2134       col = in[l];
2135       if (!A->structure_only) {
2136         if (roworiented) {
2137           value = v + (k * (stepval + bs) + l) * bs;
2138         } else {
2139           value = v + (l * (stepval + bs) + k) * bs;
2140         }
2141       }
2142       if (col <= lastcol) low = 0;
2143       else high = nrow;
2144       lastcol = col;
2145       while (high - low > 7) {
2146         t = (low + high) / 2;
2147         if (rp[t] > col) high = t;
2148         else low = t;
2149       }
2150       for (i = low; i < high; i++) {
2151         if (rp[i] > col) break;
2152         if (rp[i] == col) {
2153           if (A->structure_only) goto noinsert2;
2154           bap = ap + bs2 * i;
2155           if (roworiented) {
2156             if (is == ADD_VALUES) {
2157               for (ii = 0; ii < bs; ii++, value += stepval) {
2158                 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2159               }
2160             } else {
2161               for (ii = 0; ii < bs; ii++, value += stepval) {
2162                 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2163               }
2164             }
2165           } else {
2166             if (is == ADD_VALUES) {
2167               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2168                 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2169                 bap += bs;
2170               }
2171             } else {
2172               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2173                 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2174                 bap += bs;
2175               }
2176             }
2177           }
2178           goto noinsert2;
2179         }
2180       }
2181       if (nonew == 1) goto noinsert2;
2182       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new blocked index new nonzero block (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2183       if (A->structure_only) {
2184         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2185       } else {
2186         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2187       }
2188       N = nrow++ - 1;
2189       high++;
2190       /* shift up all the later entries in this row */
2191       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2192       rp[i] = col;
2193       if (!A->structure_only) {
2194         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2195         bap = ap + bs2 * i;
2196         if (roworiented) {
2197           for (ii = 0; ii < bs; ii++, value += stepval) {
2198             for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2199           }
2200         } else {
2201           for (ii = 0; ii < bs; ii++, value += stepval) {
2202             for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2203           }
2204         }
2205       }
2206     noinsert2:;
2207       low = i;
2208     }
2209     ailen[row] = nrow;
2210   }
2211   PetscFunctionReturn(PETSC_SUCCESS);
2212 }
2213 
MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)2214 PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode)
2215 {
2216   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2217   PetscInt     fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2218   PetscInt     m = A->rmap->N, *ip, N, *ailen = a->ilen;
2219   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2220   MatScalar   *aa    = a->a, *ap;
2221   PetscReal    ratio = 0.6;
2222 
2223   PetscFunctionBegin;
2224   if (mode == MAT_FLUSH_ASSEMBLY || (A->was_assembled && A->ass_nonzerostate == A->nonzerostate)) PetscFunctionReturn(PETSC_SUCCESS);
2225 
2226   if (m) rmax = ailen[0];
2227   for (i = 1; i < mbs; i++) {
2228     /* move each row back by the amount of empty slots (fshift) before it*/
2229     fshift += imax[i - 1] - ailen[i - 1];
2230     rmax = PetscMax(rmax, ailen[i]);
2231     if (fshift) {
2232       ip = aj + ai[i];
2233       ap = aa + bs2 * ai[i];
2234       N  = ailen[i];
2235       PetscCall(PetscArraymove(ip - fshift, ip, N));
2236       if (!A->structure_only) PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2 * N));
2237     }
2238     ai[i] = ai[i - 1] + ailen[i - 1];
2239   }
2240   if (mbs) {
2241     fshift += imax[mbs - 1] - ailen[mbs - 1];
2242     ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2243   }
2244 
2245   /* reset ilen and imax for each row */
2246   a->nonzerorowcnt = 0;
2247   if (A->structure_only) {
2248     PetscCall(PetscFree2(a->imax, a->ilen));
2249   } else { /* !A->structure_only */
2250     for (i = 0; i < mbs; i++) {
2251       ailen[i] = imax[i] = ai[i + 1] - ai[i];
2252       a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2253     }
2254   }
2255   a->nz = ai[mbs];
2256 
2257   /* diagonals may have moved, so kill the diagonal pointers */
2258   a->idiagvalid = PETSC_FALSE;
2259   if (fshift) PetscCheck(a->nounused != -1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unused space detected in matrix: %" PetscInt_FMT " X %" PetscInt_FMT " block size %" PetscInt_FMT ", %" PetscInt_FMT " unneeded", m, A->cmap->n, A->rmap->bs, fshift * bs2);
2260   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2));
2261   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs));
2262   PetscCall(PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax));
2263 
2264   A->info.mallocs += a->reallocs;
2265   a->reallocs         = 0;
2266   A->info.nz_unneeded = (PetscReal)fshift * bs2;
2267   a->rmax             = rmax;
2268 
2269   if (!A->structure_only) PetscCall(MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio));
2270   PetscFunctionReturn(PETSC_SUCCESS);
2271 }
2272 
2273 /*
2274    This function returns an array of flags which indicate the locations of contiguous
2275    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2276    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2277    Assume: sizes should be long enough to hold all the values.
2278 */
MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[],PetscInt n,PetscInt bs,PetscInt sizes[],PetscInt * bs_max)2279 static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max)
2280 {
2281   PetscInt j = 0;
2282 
2283   PetscFunctionBegin;
2284   for (PetscInt i = 0; i < n; j++) {
2285     PetscInt row = idx[i];
2286     if (row % bs != 0) { /* Not the beginning of a block */
2287       sizes[j] = 1;
2288       i++;
2289     } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2290       sizes[j] = 1;          /* Also makes sure at least 'bs' values exist for next else */
2291       i++;
2292     } else { /* Beginning of the block, so check if the complete block exists */
2293       PetscBool flg = PETSC_TRUE;
2294       for (PetscInt k = 1; k < bs; k++) {
2295         if (row + k != idx[i + k]) { /* break in the block */
2296           flg = PETSC_FALSE;
2297           break;
2298         }
2299       }
2300       if (flg) { /* No break in the bs */
2301         sizes[j] = bs;
2302         i += bs;
2303       } else {
2304         sizes[j] = 1;
2305         i++;
2306       }
2307     }
2308   }
2309   *bs_max = j;
2310   PetscFunctionReturn(PETSC_SUCCESS);
2311 }
2312 
MatZeroRows_SeqBAIJ(Mat A,PetscInt is_n,const PetscInt is_idx[],PetscScalar diag,Vec x,Vec b)2313 PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2314 {
2315   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2316   PetscInt           i, j, k, count, *rows;
2317   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2318   PetscScalar        zero = 0.0;
2319   MatScalar         *aa;
2320   const PetscScalar *xx;
2321   PetscScalar       *bb;
2322 
2323   PetscFunctionBegin;
2324   /* fix right-hand side if needed */
2325   if (x && b) {
2326     PetscCall(VecGetArrayRead(x, &xx));
2327     PetscCall(VecGetArray(b, &bb));
2328     for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2329     PetscCall(VecRestoreArrayRead(x, &xx));
2330     PetscCall(VecRestoreArray(b, &bb));
2331   }
2332 
2333   /* Make a copy of the IS and  sort it */
2334   /* allocate memory for rows,sizes */
2335   PetscCall(PetscMalloc2(is_n, &rows, 2 * is_n, &sizes));
2336 
2337   /* copy IS values to rows, and sort them */
2338   for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2339   PetscCall(PetscSortInt(is_n, rows));
2340 
2341   if (baij->keepnonzeropattern) {
2342     for (i = 0; i < is_n; i++) sizes[i] = 1;
2343     bs_max = is_n;
2344   } else {
2345     PetscCall(MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max));
2346     A->nonzerostate++;
2347   }
2348 
2349   for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2350     row = rows[j];
2351     PetscCheck(row >= 0 && row <= A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", row);
2352     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2353     aa    = baij->a + baij->i[row / bs] * bs2 + (row % bs);
2354     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2355       if (diag != (PetscScalar)0.0) {
2356         if (baij->ilen[row / bs] > 0) {
2357           baij->ilen[row / bs]       = 1;
2358           baij->j[baij->i[row / bs]] = row / bs;
2359 
2360           PetscCall(PetscArrayzero(aa, count * bs));
2361         }
2362         /* Now insert all the diagonal values for this bs */
2363         for (k = 0; k < bs; k++) PetscUseTypeMethod(A, setvalues, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES);
2364       } else { /* (diag == 0.0) */
2365         baij->ilen[row / bs] = 0;
2366       } /* end (diag == 0.0) */
2367     } else { /* (sizes[i] != bs) */
2368       PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2369       for (k = 0; k < count; k++) {
2370         aa[0] = zero;
2371         aa += bs;
2372       }
2373       if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES);
2374     }
2375   }
2376 
2377   PetscCall(PetscFree2(rows, sizes));
2378   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2379   PetscFunctionReturn(PETSC_SUCCESS);
2380 }
2381 
MatZeroRowsColumns_SeqBAIJ(Mat A,PetscInt is_n,const PetscInt is_idx[],PetscScalar diag,Vec x,Vec b)2382 static PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2383 {
2384   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2385   PetscInt           i, j, k, count;
2386   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2387   PetscScalar        zero = 0.0;
2388   MatScalar         *aa;
2389   const PetscScalar *xx;
2390   PetscScalar       *bb;
2391   PetscBool         *zeroed, vecs = PETSC_FALSE;
2392 
2393   PetscFunctionBegin;
2394   /* fix right-hand side if needed */
2395   if (x && b) {
2396     PetscCall(VecGetArrayRead(x, &xx));
2397     PetscCall(VecGetArray(b, &bb));
2398     vecs = PETSC_TRUE;
2399   }
2400 
2401   /* zero the columns */
2402   PetscCall(PetscCalloc1(A->rmap->n, &zeroed));
2403   for (i = 0; i < is_n; i++) {
2404     PetscCheck(is_idx[i] >= 0 && is_idx[i] < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", is_idx[i]);
2405     zeroed[is_idx[i]] = PETSC_TRUE;
2406   }
2407   for (i = 0; i < A->rmap->N; i++) {
2408     if (!zeroed[i]) {
2409       row = i / bs;
2410       for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2411         for (k = 0; k < bs; k++) {
2412           col = bs * baij->j[j] + k;
2413           if (zeroed[col]) {
2414             aa = baij->a + j * bs2 + (i % bs) + bs * k;
2415             if (vecs) bb[i] -= aa[0] * xx[col];
2416             aa[0] = 0.0;
2417           }
2418         }
2419       }
2420     } else if (vecs) bb[i] = diag * xx[i];
2421   }
2422   PetscCall(PetscFree(zeroed));
2423   if (vecs) {
2424     PetscCall(VecRestoreArrayRead(x, &xx));
2425     PetscCall(VecRestoreArray(b, &bb));
2426   }
2427 
2428   /* zero the rows */
2429   for (i = 0; i < is_n; i++) {
2430     row   = is_idx[i];
2431     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2432     aa    = baij->a + baij->i[row / bs] * bs2 + (row % bs);
2433     for (k = 0; k < count; k++) {
2434       aa[0] = zero;
2435       aa += bs;
2436     }
2437     if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2438   }
2439   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2440   PetscFunctionReturn(PETSC_SUCCESS);
2441 }
2442 
MatSetValues_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)2443 PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2444 {
2445   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2446   PetscInt    *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2447   PetscInt    *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2448   PetscInt    *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2449   PetscInt     ridx, cidx, bs2                 = a->bs2;
2450   PetscBool    roworiented = a->roworiented;
2451   MatScalar   *ap = NULL, value = 0.0, *aa = a->a, *bap;
2452 
2453   PetscFunctionBegin;
2454   for (k = 0; k < m; k++) { /* loop over added rows */
2455     row  = im[k];
2456     brow = row / bs;
2457     if (row < 0) continue;
2458     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->N - 1);
2459     rp = PetscSafePointerPlusOffset(aj, ai[brow]);
2460     if (!A->structure_only) ap = PetscSafePointerPlusOffset(aa, bs2 * ai[brow]);
2461     rmax = imax[brow];
2462     nrow = ailen[brow];
2463     low  = 0;
2464     high = nrow;
2465     for (l = 0; l < n; l++) { /* loop over added columns */
2466       if (in[l] < 0) continue;
2467       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[l], A->cmap->n - 1);
2468       col  = in[l];
2469       bcol = col / bs;
2470       ridx = row % bs;
2471       cidx = col % bs;
2472       if (!A->structure_only) {
2473         if (roworiented) {
2474           value = v[l + k * n];
2475         } else {
2476           value = v[k + l * m];
2477         }
2478       }
2479       if (col <= lastcol) low = 0;
2480       else high = nrow;
2481       lastcol = col;
2482       while (high - low > 7) {
2483         t = (low + high) / 2;
2484         if (rp[t] > bcol) high = t;
2485         else low = t;
2486       }
2487       for (i = low; i < high; i++) {
2488         if (rp[i] > bcol) break;
2489         if (rp[i] == bcol) {
2490           bap = PetscSafePointerPlusOffset(ap, bs2 * i + bs * cidx + ridx);
2491           if (!A->structure_only) {
2492             if (is == ADD_VALUES) *bap += value;
2493             else *bap = value;
2494           }
2495           goto noinsert1;
2496         }
2497       }
2498       if (nonew == 1) goto noinsert1;
2499       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2500       if (A->structure_only) {
2501         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2502       } else {
2503         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2504       }
2505       N = nrow++ - 1;
2506       high++;
2507       /* shift up all the later entries in this row */
2508       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2509       rp[i] = bcol;
2510       if (!A->structure_only) {
2511         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2512         PetscCall(PetscArrayzero(ap + bs2 * i, bs2));
2513         ap[bs2 * i + bs * cidx + ridx] = value;
2514       }
2515       a->nz++;
2516     noinsert1:;
2517       low = i;
2518     }
2519     ailen[brow] = nrow;
2520   }
2521   PetscFunctionReturn(PETSC_SUCCESS);
2522 }
2523 
MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,const MatFactorInfo * info)2524 static PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info)
2525 {
2526   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2527   Mat          outA;
2528   PetscBool    row_identity, col_identity;
2529 
2530   PetscFunctionBegin;
2531   PetscCheck(info->levels == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only levels = 0 supported for in-place ILU");
2532   PetscCall(ISIdentity(row, &row_identity));
2533   PetscCall(ISIdentity(col, &col_identity));
2534   PetscCheck(row_identity && col_identity, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column permutations must be identity for in-place ILU");
2535 
2536   outA            = inA;
2537   inA->factortype = MAT_FACTOR_LU;
2538   PetscCall(PetscFree(inA->solvertype));
2539   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype));
2540 
2541   PetscCall(PetscObjectReference((PetscObject)row));
2542   PetscCall(ISDestroy(&a->row));
2543   a->row = row;
2544   PetscCall(PetscObjectReference((PetscObject)col));
2545   PetscCall(ISDestroy(&a->col));
2546   a->col = col;
2547 
2548   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2549   PetscCall(ISDestroy(&a->icol));
2550   PetscCall(ISInvertPermutation(col, PETSC_DECIDE, &a->icol));
2551 
2552   PetscCall(MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity)));
2553   if (!a->solve_work) PetscCall(PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work));
2554   PetscCall(MatLUFactorNumeric(outA, inA, info));
2555   PetscFunctionReturn(PETSC_SUCCESS);
2556 }
2557 
MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat,const PetscInt * indices)2558 static PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, const PetscInt *indices)
2559 {
2560   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;
2561 
2562   PetscFunctionBegin;
2563   baij->nz = baij->maxnz;
2564   PetscCall(PetscArraycpy(baij->j, indices, baij->nz));
2565   PetscCall(PetscArraycpy(baij->ilen, baij->imax, baij->mbs));
2566   PetscFunctionReturn(PETSC_SUCCESS);
2567 }
2568 
2569 /*@
2570   MatSeqBAIJSetColumnIndices - Set the column indices for all the block rows in the matrix.
2571 
2572   Input Parameters:
2573 + mat     - the `MATSEQBAIJ` matrix
2574 - indices - the block column indices
2575 
2576   Level: advanced
2577 
2578   Notes:
2579   This can be called if you have precomputed the nonzero structure of the
2580   matrix and want to provide it to the matrix object to improve the performance
2581   of the `MatSetValues()` operation.
2582 
2583   You MUST have set the correct numbers of nonzeros per row in the call to
2584   `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.
2585 
2586   MUST be called before any calls to `MatSetValues()`
2587 
2588 .seealso: [](ch_matrices), `Mat`, `MATSEQBAIJ`, `MatSetValues()`
2589 @*/
MatSeqBAIJSetColumnIndices(Mat mat,PetscInt * indices)2590 PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices)
2591 {
2592   PetscFunctionBegin;
2593   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
2594   PetscAssertPointer(indices, 2);
2595   PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, const PetscInt *), (mat, (const PetscInt *)indices));
2596   PetscFunctionReturn(PETSC_SUCCESS);
2597 }
2598 
MatGetRowMaxAbs_SeqBAIJ(Mat A,Vec v,PetscInt idx[])2599 static PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[])
2600 {
2601   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2602   PetscInt     i, j, n, row, bs, *ai, *aj, mbs;
2603   PetscReal    atmp;
2604   PetscScalar *x, zero = 0.0;
2605   MatScalar   *aa;
2606   PetscInt     ncols, brow, krow, kcol;
2607 
2608   PetscFunctionBegin;
2609   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2610   bs  = A->rmap->bs;
2611   aa  = a->a;
2612   ai  = a->i;
2613   aj  = a->j;
2614   mbs = a->mbs;
2615 
2616   PetscCall(VecSet(v, zero));
2617   PetscCall(VecGetArray(v, &x));
2618   PetscCall(VecGetLocalSize(v, &n));
2619   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2620   for (i = 0; i < mbs; i++) {
2621     ncols = ai[1] - ai[0];
2622     ai++;
2623     brow = bs * i;
2624     for (j = 0; j < ncols; j++) {
2625       for (kcol = 0; kcol < bs; kcol++) {
2626         for (krow = 0; krow < bs; krow++) {
2627           atmp = PetscAbsScalar(*aa);
2628           aa++;
2629           row = brow + krow; /* row index */
2630           if (PetscAbsScalar(x[row]) < atmp) {
2631             x[row] = atmp;
2632             if (idx) idx[row] = bs * (*aj) + kcol;
2633           }
2634         }
2635       }
2636       aj++;
2637     }
2638   }
2639   PetscCall(VecRestoreArray(v, &x));
2640   PetscFunctionReturn(PETSC_SUCCESS);
2641 }
2642 
MatGetRowSumAbs_SeqBAIJ(Mat A,Vec v)2643 static PetscErrorCode MatGetRowSumAbs_SeqBAIJ(Mat A, Vec v)
2644 {
2645   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2646   PetscInt     i, j, n, row, bs, *ai, mbs;
2647   PetscReal    atmp;
2648   PetscScalar *x, zero = 0.0;
2649   MatScalar   *aa;
2650   PetscInt     ncols, brow, krow, kcol;
2651 
2652   PetscFunctionBegin;
2653   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2654   bs  = A->rmap->bs;
2655   aa  = a->a;
2656   ai  = a->i;
2657   mbs = a->mbs;
2658 
2659   PetscCall(VecSet(v, zero));
2660   PetscCall(VecGetArrayWrite(v, &x));
2661   PetscCall(VecGetLocalSize(v, &n));
2662   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2663   for (i = 0; i < mbs; i++) {
2664     ncols = ai[1] - ai[0];
2665     ai++;
2666     brow = bs * i;
2667     for (j = 0; j < ncols; j++) {
2668       for (kcol = 0; kcol < bs; kcol++) {
2669         for (krow = 0; krow < bs; krow++) {
2670           atmp = PetscAbsScalar(*aa);
2671           aa++;
2672           row = brow + krow; /* row index */
2673           x[row] += atmp;
2674         }
2675       }
2676     }
2677   }
2678   PetscCall(VecRestoreArrayWrite(v, &x));
2679   PetscFunctionReturn(PETSC_SUCCESS);
2680 }
2681 
MatCopy_SeqBAIJ(Mat A,Mat B,MatStructure str)2682 static PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str)
2683 {
2684   PetscFunctionBegin;
2685   /* If the two matrices have the same copy implementation, use fast copy. */
2686   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2687     Mat_SeqBAIJ *a    = (Mat_SeqBAIJ *)A->data;
2688     Mat_SeqBAIJ *b    = (Mat_SeqBAIJ *)B->data;
2689     PetscInt     ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;
2690 
2691     PetscCheck(a->i[ambs] == b->i[bmbs], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzero blocks in matrices A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", a->i[ambs], b->i[bmbs]);
2692     PetscCheck(abs == bbs, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Block size A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", abs, bbs);
2693     PetscCall(PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]));
2694     PetscCall(PetscObjectStateIncrease((PetscObject)B));
2695   } else {
2696     PetscCall(MatCopy_Basic(A, B, str));
2697   }
2698   PetscFunctionReturn(PETSC_SUCCESS);
2699 }
2700 
MatSeqBAIJGetArray_SeqBAIJ(Mat A,PetscScalar * array[])2701 static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[])
2702 {
2703   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2704 
2705   PetscFunctionBegin;
2706   *array = a->a;
2707   PetscFunctionReturn(PETSC_SUCCESS);
2708 }
2709 
MatSeqBAIJRestoreArray_SeqBAIJ(Mat A,PetscScalar * array[])2710 static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[])
2711 {
2712   PetscFunctionBegin;
2713   *array = NULL;
2714   PetscFunctionReturn(PETSC_SUCCESS);
2715 }
2716 
MatAXPYGetPreallocation_SeqBAIJ(Mat Y,Mat X,PetscInt * nnz)2717 PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz)
2718 {
2719   PetscInt     bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2720   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2721   Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;
2722 
2723   PetscFunctionBegin;
2724   /* Set the number of nonzeros in the new matrix */
2725   PetscCall(MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz));
2726   PetscFunctionReturn(PETSC_SUCCESS);
2727 }
2728 
MatAXPY_SeqBAIJ(Mat Y,PetscScalar a,Mat X,MatStructure str)2729 PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2730 {
2731   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2732   PetscInt     bs = Y->rmap->bs, bs2 = bs * bs;
2733   PetscBLASInt one = 1;
2734 
2735   PetscFunctionBegin;
2736   if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2737     PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2738     if (e) {
2739       PetscCall(PetscArraycmp(x->i, y->i, x->mbs + 1, &e));
2740       if (e) {
2741         PetscCall(PetscArraycmp(x->j, y->j, x->i[x->mbs], &e));
2742         if (e) str = SAME_NONZERO_PATTERN;
2743       }
2744     }
2745     if (!e) PetscCheck(str != SAME_NONZERO_PATTERN, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "MatStructure is not SAME_NONZERO_PATTERN");
2746   }
2747   if (str == SAME_NONZERO_PATTERN) {
2748     PetscScalar  alpha = a;
2749     PetscBLASInt bnz;
2750     PetscCall(PetscBLASIntCast(x->nz * bs2, &bnz));
2751     PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2752     PetscCall(PetscObjectStateIncrease((PetscObject)Y));
2753   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2754     PetscCall(MatAXPY_Basic(Y, a, X, str));
2755   } else {
2756     Mat       B;
2757     PetscInt *nnz;
2758     PetscCheck(bs == X->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrices must have same block size");
2759     PetscCall(PetscMalloc1(Y->rmap->N, &nnz));
2760     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2761     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2762     PetscCall(MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N));
2763     PetscCall(MatSetBlockSizesFromMats(B, Y, Y));
2764     PetscCall(MatSetType(B, (MatType)((PetscObject)Y)->type_name));
2765     PetscCall(MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz));
2766     PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
2767     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2768     PetscCall(MatHeaderMerge(Y, &B));
2769     PetscCall(PetscFree(nnz));
2770   }
2771   PetscFunctionReturn(PETSC_SUCCESS);
2772 }
2773 
MatConjugate_SeqBAIJ(Mat A)2774 PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2775 {
2776   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2777   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2778   MatScalar   *aa = a->a;
2779 
2780   PetscFunctionBegin;
2781   for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2782   PetscFunctionReturn(PETSC_SUCCESS);
2783 }
2784 
MatRealPart_SeqBAIJ(Mat A)2785 static PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2786 {
2787 #if PetscDefined(USE_COMPLEX)
2788   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2789   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2790   MatScalar   *aa = a->a;
2791 
2792   PetscFunctionBegin;
2793   for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2794   PetscFunctionReturn(PETSC_SUCCESS);
2795 #else
2796   (void)A;
2797   return PETSC_SUCCESS;
2798 #endif
2799 }
2800 
MatImaginaryPart_SeqBAIJ(Mat A)2801 static PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2802 {
2803 #if PetscDefined(USE_COMPLEX)
2804   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2805   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2806   MatScalar   *aa = a->a;
2807 
2808   PetscFunctionBegin;
2809   for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2810   PetscFunctionReturn(PETSC_SUCCESS);
2811 #else
2812   (void)A;
2813   return PETSC_SUCCESS;
2814 #endif
2815 }
2816 
2817 /*
2818     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2819 */
MatGetColumnIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt * nn,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)2820 static PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2821 {
2822   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2823   PetscInt     bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2824   PetscInt     nz = a->i[m], row, *jj, mr, col;
2825 
2826   PetscFunctionBegin;
2827   *nn = n;
2828   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2829   PetscCheck(!symmetric, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for BAIJ matrices");
2830   PetscCall(PetscCalloc1(n, &collengths));
2831   PetscCall(PetscMalloc1(n + 1, &cia));
2832   PetscCall(PetscMalloc1(nz, &cja));
2833   jj = a->j;
2834   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2835   cia[0] = oshift;
2836   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2837   PetscCall(PetscArrayzero(collengths, n));
2838   jj = a->j;
2839   for (row = 0; row < m; row++) {
2840     mr = a->i[row + 1] - a->i[row];
2841     for (i = 0; i < mr; i++) {
2842       col = *jj++;
2843 
2844       cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2845     }
2846   }
2847   PetscCall(PetscFree(collengths));
2848   *ia = cia;
2849   *ja = cja;
2850   PetscFunctionReturn(PETSC_SUCCESS);
2851 }
2852 
MatRestoreColumnIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)2853 static PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2854 {
2855   PetscFunctionBegin;
2856   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2857   PetscCall(PetscFree(*ia));
2858   PetscCall(PetscFree(*ja));
2859   PetscFunctionReturn(PETSC_SUCCESS);
2860 }
2861 
2862 /*
2863  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2864  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2865  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2866  */
MatGetColumnIJ_SeqBAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt * nn,const PetscInt * ia[],const PetscInt * ja[],PetscInt * spidx[],PetscBool * done)2867 PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2868 {
2869   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2870   PetscInt     i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2871   PetscInt     nz = a->i[m], row, *jj, mr, col;
2872   PetscInt    *cspidx;
2873 
2874   PetscFunctionBegin;
2875   *nn = n;
2876   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2877 
2878   PetscCall(PetscCalloc1(n, &collengths));
2879   PetscCall(PetscMalloc1(n + 1, &cia));
2880   PetscCall(PetscMalloc1(nz, &cja));
2881   PetscCall(PetscMalloc1(nz, &cspidx));
2882   jj = a->j;
2883   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2884   cia[0] = oshift;
2885   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2886   PetscCall(PetscArrayzero(collengths, n));
2887   jj = a->j;
2888   for (row = 0; row < m; row++) {
2889     mr = a->i[row + 1] - a->i[row];
2890     for (i = 0; i < mr; i++) {
2891       col                                         = *jj++;
2892       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2893       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2894     }
2895   }
2896   PetscCall(PetscFree(collengths));
2897   *ia    = cia;
2898   *ja    = cja;
2899   *spidx = cspidx;
2900   PetscFunctionReturn(PETSC_SUCCESS);
2901 }
2902 
MatRestoreColumnIJ_SeqBAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscInt * spidx[],PetscBool * done)2903 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2904 {
2905   PetscFunctionBegin;
2906   PetscCall(MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done));
2907   PetscCall(PetscFree(*spidx));
2908   PetscFunctionReturn(PETSC_SUCCESS);
2909 }
2910 
MatShift_SeqBAIJ(Mat Y,PetscScalar a)2911 static PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a)
2912 {
2913   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;
2914 
2915   PetscFunctionBegin;
2916   if (!Y->preallocated || !aij->nz) PetscCall(MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL));
2917   PetscCall(MatShift_Basic(Y, a));
2918   PetscFunctionReturn(PETSC_SUCCESS);
2919 }
2920 
MatEliminateZeros_SeqBAIJ(Mat A,PetscBool keep)2921 PetscErrorCode MatEliminateZeros_SeqBAIJ(Mat A, PetscBool keep)
2922 {
2923   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2924   PetscInt     fshift = 0, fshift_prev = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax, j, k;
2925   PetscInt     m = A->rmap->N, *ailen = a->ilen;
2926   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2927   MatScalar   *aa = a->a, *ap;
2928   PetscBool    zero;
2929 
2930   PetscFunctionBegin;
2931   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot eliminate zeros for unassembled matrix");
2932   if (m) rmax = ailen[0];
2933   for (i = 1; i <= mbs; i++) {
2934     for (k = ai[i - 1]; k < ai[i]; k++) {
2935       zero = PETSC_TRUE;
2936       ap   = aa + bs2 * k;
2937       for (j = 0; j < bs2 && zero; j++) {
2938         if (ap[j] != 0.0) zero = PETSC_FALSE;
2939       }
2940       if (zero && (aj[k] != i - 1 || !keep)) fshift++;
2941       else {
2942         if (zero && aj[k] == i - 1) PetscCall(PetscInfo(A, "Keep the diagonal block at row %" PetscInt_FMT "\n", i - 1));
2943         aj[k - fshift] = aj[k];
2944         PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2));
2945       }
2946     }
2947     ai[i - 1] -= fshift_prev;
2948     fshift_prev  = fshift;
2949     ailen[i - 1] = imax[i - 1] = ai[i] - fshift - ai[i - 1];
2950     a->nonzerorowcnt += ((ai[i] - fshift - ai[i - 1]) > 0);
2951     rmax = PetscMax(rmax, ailen[i - 1]);
2952   }
2953   if (fshift) {
2954     if (mbs) {
2955       ai[mbs] -= fshift;
2956       a->nz = ai[mbs];
2957     }
2958     PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; zeros eliminated: %" PetscInt_FMT "; nonzeros left: %" PetscInt_FMT "\n", m, A->cmap->n, fshift, a->nz));
2959     A->nonzerostate++;
2960     A->info.nz_unneeded += (PetscReal)fshift;
2961     a->rmax = rmax;
2962     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
2963     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
2964   }
2965   PetscFunctionReturn(PETSC_SUCCESS);
2966 }
2967 
2968 static struct _MatOps MatOps_Values = {MatSetValues_SeqBAIJ,
2969                                        MatGetRow_SeqBAIJ,
2970                                        MatRestoreRow_SeqBAIJ,
2971                                        MatMult_SeqBAIJ_N,
2972                                        /* 4*/ MatMultAdd_SeqBAIJ_N,
2973                                        MatMultTranspose_SeqBAIJ,
2974                                        MatMultTransposeAdd_SeqBAIJ,
2975                                        NULL,
2976                                        NULL,
2977                                        NULL,
2978                                        /* 10*/ NULL,
2979                                        MatLUFactor_SeqBAIJ,
2980                                        NULL,
2981                                        NULL,
2982                                        MatTranspose_SeqBAIJ,
2983                                        /* 15*/ MatGetInfo_SeqBAIJ,
2984                                        MatEqual_SeqBAIJ,
2985                                        MatGetDiagonal_SeqBAIJ,
2986                                        MatDiagonalScale_SeqBAIJ,
2987                                        MatNorm_SeqBAIJ,
2988                                        /* 20*/ NULL,
2989                                        MatAssemblyEnd_SeqBAIJ,
2990                                        MatSetOption_SeqBAIJ,
2991                                        MatZeroEntries_SeqBAIJ,
2992                                        /* 24*/ MatZeroRows_SeqBAIJ,
2993                                        NULL,
2994                                        NULL,
2995                                        NULL,
2996                                        NULL,
2997                                        /* 29*/ MatSetUp_Seq_Hash,
2998                                        NULL,
2999                                        NULL,
3000                                        NULL,
3001                                        NULL,
3002                                        /* 34*/ MatDuplicate_SeqBAIJ,
3003                                        NULL,
3004                                        NULL,
3005                                        MatILUFactor_SeqBAIJ,
3006                                        NULL,
3007                                        /* 39*/ MatAXPY_SeqBAIJ,
3008                                        MatCreateSubMatrices_SeqBAIJ,
3009                                        MatIncreaseOverlap_SeqBAIJ,
3010                                        MatGetValues_SeqBAIJ,
3011                                        MatCopy_SeqBAIJ,
3012                                        /* 44*/ NULL,
3013                                        MatScale_SeqBAIJ,
3014                                        MatShift_SeqBAIJ,
3015                                        NULL,
3016                                        MatZeroRowsColumns_SeqBAIJ,
3017                                        /* 49*/ NULL,
3018                                        MatGetRowIJ_SeqBAIJ,
3019                                        MatRestoreRowIJ_SeqBAIJ,
3020                                        MatGetColumnIJ_SeqBAIJ,
3021                                        MatRestoreColumnIJ_SeqBAIJ,
3022                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
3023                                        NULL,
3024                                        NULL,
3025                                        NULL,
3026                                        MatSetValuesBlocked_SeqBAIJ,
3027                                        /* 59*/ MatCreateSubMatrix_SeqBAIJ,
3028                                        MatDestroy_SeqBAIJ,
3029                                        MatView_SeqBAIJ,
3030                                        NULL,
3031                                        NULL,
3032                                        /* 64*/ NULL,
3033                                        NULL,
3034                                        NULL,
3035                                        NULL,
3036                                        MatGetRowMaxAbs_SeqBAIJ,
3037                                        /* 69*/ NULL,
3038                                        MatConvert_Basic,
3039                                        NULL,
3040                                        MatFDColoringApply_BAIJ,
3041                                        NULL,
3042                                        /* 74*/ NULL,
3043                                        NULL,
3044                                        NULL,
3045                                        NULL,
3046                                        MatLoad_SeqBAIJ,
3047                                        /* 79*/ NULL,
3048                                        NULL,
3049                                        NULL,
3050                                        NULL,
3051                                        NULL,
3052                                        /* 84*/ NULL,
3053                                        NULL,
3054                                        NULL,
3055                                        NULL,
3056                                        NULL,
3057                                        /* 89*/ NULL,
3058                                        NULL,
3059                                        NULL,
3060                                        NULL,
3061                                        MatConjugate_SeqBAIJ,
3062                                        /* 94*/ NULL,
3063                                        NULL,
3064                                        MatRealPart_SeqBAIJ,
3065                                        MatImaginaryPart_SeqBAIJ,
3066                                        NULL,
3067                                        /* 99*/ NULL,
3068                                        NULL,
3069                                        NULL,
3070                                        NULL,
3071                                        NULL,
3072                                        /*104*/ NULL,
3073                                        NULL,
3074                                        NULL,
3075                                        NULL,
3076                                        NULL,
3077                                        /*109*/ NULL,
3078                                        NULL,
3079                                        MatMultHermitianTranspose_SeqBAIJ,
3080                                        MatMultHermitianTransposeAdd_SeqBAIJ,
3081                                        NULL,
3082                                        /*114*/ NULL,
3083                                        MatGetColumnReductions_SeqBAIJ,
3084                                        MatInvertBlockDiagonal_SeqBAIJ,
3085                                        NULL,
3086                                        NULL,
3087                                        /*119*/ NULL,
3088                                        NULL,
3089                                        NULL,
3090                                        NULL,
3091                                        NULL,
3092                                        /*124*/ NULL,
3093                                        NULL,
3094                                        MatSetBlockSizes_Default,
3095                                        NULL,
3096                                        MatFDColoringSetUp_SeqXAIJ,
3097                                        /*129*/ NULL,
3098                                        MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
3099                                        MatDestroySubMatrices_SeqBAIJ,
3100                                        NULL,
3101                                        NULL,
3102                                        /*134*/ NULL,
3103                                        NULL,
3104                                        MatEliminateZeros_SeqBAIJ,
3105                                        MatGetRowSumAbs_SeqBAIJ,
3106                                        NULL,
3107                                        /*139*/ NULL,
3108                                        NULL,
3109                                        MatCopyHashToXAIJ_Seq_Hash,
3110                                        NULL,
3111                                        NULL};
3112 
MatStoreValues_SeqBAIJ(Mat mat)3113 static PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat)
3114 {
3115   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3116   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3117 
3118   PetscFunctionBegin;
3119   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3120 
3121   /* allocate space for values if not already there */
3122   if (!aij->saved_values) PetscCall(PetscMalloc1(nz + 1, &aij->saved_values));
3123 
3124   /* copy values over */
3125   PetscCall(PetscArraycpy(aij->saved_values, aij->a, nz));
3126   PetscFunctionReturn(PETSC_SUCCESS);
3127 }
3128 
MatRetrieveValues_SeqBAIJ(Mat mat)3129 static PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat)
3130 {
3131   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3132   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3133 
3134   PetscFunctionBegin;
3135   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3136   PetscCheck(aij->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
3137 
3138   /* copy values over */
3139   PetscCall(PetscArraycpy(aij->a, aij->saved_values, nz));
3140   PetscFunctionReturn(PETSC_SUCCESS);
3141 }
3142 
3143 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3144 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);
3145 
MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B,PetscInt bs,PetscInt nz,const PetscInt nnz[])3146 PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3147 {
3148   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)B->data;
3149   PetscInt     i, mbs, nbs, bs2;
3150   PetscBool    flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
3151 
3152   PetscFunctionBegin;
3153   if (B->hash_active) {
3154     PetscInt bs;
3155     B->ops[0] = b->cops;
3156     PetscCall(PetscHMapIJVDestroy(&b->ht));
3157     PetscCall(MatGetBlockSize(B, &bs));
3158     if (bs > 1) PetscCall(PetscHSetIJDestroy(&b->bht));
3159     PetscCall(PetscFree(b->dnz));
3160     PetscCall(PetscFree(b->bdnz));
3161     B->hash_active = PETSC_FALSE;
3162   }
3163   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3164   if (nz == MAT_SKIP_ALLOCATION) {
3165     skipallocation = PETSC_TRUE;
3166     nz             = 0;
3167   }
3168 
3169   PetscCall(MatSetBlockSize(B, bs));
3170   PetscCall(PetscLayoutSetUp(B->rmap));
3171   PetscCall(PetscLayoutSetUp(B->cmap));
3172   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3173 
3174   B->preallocated = PETSC_TRUE;
3175 
3176   mbs = B->rmap->n / bs;
3177   nbs = B->cmap->n / bs;
3178   bs2 = bs * bs;
3179 
3180   PetscCheck(mbs * bs == B->rmap->n && nbs * bs == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows %" PetscInt_FMT ", cols %" PetscInt_FMT " must be divisible by blocksize %" PetscInt_FMT, B->rmap->N, B->cmap->n, bs);
3181 
3182   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3183   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nz cannot be less than 0: value %" PetscInt_FMT, nz);
3184   if (nnz) {
3185     for (i = 0; i < mbs; i++) {
3186       PetscCheck(nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, nnz[i]);
3187       PetscCheck(nnz[i] <= nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be greater than block row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, nnz[i], nbs);
3188     }
3189   }
3190 
3191   PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3192   PetscCall(PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL));
3193   PetscOptionsEnd();
3194 
3195   if (!flg) {
3196     switch (bs) {
3197     case 1:
3198       B->ops->mult    = MatMult_SeqBAIJ_1;
3199       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3200       break;
3201     case 2:
3202       B->ops->mult    = MatMult_SeqBAIJ_2;
3203       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3204       break;
3205     case 3:
3206       B->ops->mult    = MatMult_SeqBAIJ_3;
3207       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3208       break;
3209     case 4:
3210       B->ops->mult    = MatMult_SeqBAIJ_4;
3211       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3212       break;
3213     case 5:
3214       B->ops->mult    = MatMult_SeqBAIJ_5;
3215       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3216       break;
3217     case 6:
3218       B->ops->mult    = MatMult_SeqBAIJ_6;
3219       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3220       break;
3221     case 7:
3222       B->ops->mult    = MatMult_SeqBAIJ_7;
3223       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3224       break;
3225     case 9: {
3226       PetscInt version = 1;
3227       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3228       switch (version) {
3229 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3230       case 1:
3231         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
3232         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3233         PetscCall(PetscInfo(B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3234         break;
3235 #endif
3236       default:
3237         B->ops->mult    = MatMult_SeqBAIJ_N;
3238         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3239         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3240         break;
3241       }
3242       break;
3243     }
3244     case 11:
3245       B->ops->mult    = MatMult_SeqBAIJ_11;
3246       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3247       break;
3248     case 12: {
3249       PetscInt version = 1;
3250       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3251       switch (version) {
3252       case 1:
3253         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
3254         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3255         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3256         break;
3257       case 2:
3258         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
3259         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3260         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3261         break;
3262 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3263       case 3:
3264         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
3265         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3266         PetscCall(PetscInfo(B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3267         break;
3268 #endif
3269       default:
3270         B->ops->mult    = MatMult_SeqBAIJ_N;
3271         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3272         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3273         break;
3274       }
3275       break;
3276     }
3277     case 15: {
3278       PetscInt version = 1;
3279       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3280       switch (version) {
3281       case 1:
3282         B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3283         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3284         break;
3285       case 2:
3286         B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3287         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3288         break;
3289       case 3:
3290         B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3291         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3292         break;
3293       case 4:
3294         B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3295         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3296         break;
3297       default:
3298         B->ops->mult = MatMult_SeqBAIJ_N;
3299         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3300         break;
3301       }
3302       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3303       break;
3304     }
3305     default:
3306       B->ops->mult    = MatMult_SeqBAIJ_N;
3307       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3308       PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3309       break;
3310     }
3311   }
3312   B->ops->sor = MatSOR_SeqBAIJ;
3313   b->mbs      = mbs;
3314   b->nbs      = nbs;
3315   if (!skipallocation) {
3316     if (!b->imax) {
3317       PetscCall(PetscMalloc2(mbs, &b->imax, mbs, &b->ilen));
3318 
3319       b->free_imax_ilen = PETSC_TRUE;
3320     }
3321     /* b->ilen will count nonzeros in each block row so far. */
3322     for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3323     if (!nnz) {
3324       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3325       else if (nz < 0) nz = 1;
3326       nz = PetscMin(nz, nbs);
3327       for (i = 0; i < mbs; i++) b->imax[i] = nz;
3328       PetscCall(PetscIntMultError(nz, mbs, &nz));
3329     } else {
3330       PetscInt64 nz64 = 0;
3331       for (i = 0; i < mbs; i++) {
3332         b->imax[i] = nnz[i];
3333         nz64 += nnz[i];
3334       }
3335       PetscCall(PetscIntCast(nz64, &nz));
3336     }
3337 
3338     /* allocate the matrix space */
3339     PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
3340     PetscCall(PetscShmgetAllocateArray(nz, sizeof(PetscInt), (void **)&b->j));
3341     PetscCall(PetscShmgetAllocateArray(B->rmap->N + 1, sizeof(PetscInt), (void **)&b->i));
3342     if (B->structure_only) {
3343       b->free_a = PETSC_FALSE;
3344     } else {
3345       PetscInt nzbs2 = 0;
3346       PetscCall(PetscIntMultError(nz, bs2, &nzbs2));
3347       PetscCall(PetscShmgetAllocateArray(nzbs2, sizeof(PetscScalar), (void **)&b->a));
3348       b->free_a = PETSC_TRUE;
3349       PetscCall(PetscArrayzero(b->a, nzbs2));
3350     }
3351     b->free_ij = PETSC_TRUE;
3352     PetscCall(PetscArrayzero(b->j, nz));
3353 
3354     b->i[0] = 0;
3355     for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];
3356   } else {
3357     b->free_a  = PETSC_FALSE;
3358     b->free_ij = PETSC_FALSE;
3359   }
3360 
3361   b->bs2              = bs2;
3362   b->mbs              = mbs;
3363   b->nz               = 0;
3364   b->maxnz            = nz;
3365   B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3366   B->was_assembled    = PETSC_FALSE;
3367   B->assembled        = PETSC_FALSE;
3368   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
3369   PetscFunctionReturn(PETSC_SUCCESS);
3370 }
3371 
MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B,PetscInt bs,const PetscInt ii[],const PetscInt jj[],const PetscScalar V[])3372 static PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[])
3373 {
3374   PetscInt     i, m, nz, nz_max = 0, *nnz;
3375   PetscScalar *values      = NULL;
3376   PetscBool    roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;
3377 
3378   PetscFunctionBegin;
3379   PetscCheck(bs >= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Invalid block size specified, must be positive but it is %" PetscInt_FMT, bs);
3380   PetscCall(PetscLayoutSetBlockSize(B->rmap, bs));
3381   PetscCall(PetscLayoutSetBlockSize(B->cmap, bs));
3382   PetscCall(PetscLayoutSetUp(B->rmap));
3383   PetscCall(PetscLayoutSetUp(B->cmap));
3384   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3385   m = B->rmap->n / bs;
3386 
3387   PetscCheck(ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %" PetscInt_FMT, ii[0]);
3388   PetscCall(PetscMalloc1(m + 1, &nnz));
3389   for (i = 0; i < m; i++) {
3390     nz = ii[i + 1] - ii[i];
3391     PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative number of columns %" PetscInt_FMT, i, nz);
3392     nz_max = PetscMax(nz_max, nz);
3393     nnz[i] = nz;
3394   }
3395   PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
3396   PetscCall(PetscFree(nnz));
3397 
3398   values = (PetscScalar *)V;
3399   if (!values) PetscCall(PetscCalloc1(bs * bs * (nz_max + 1), &values));
3400   for (i = 0; i < m; i++) {
3401     PetscInt        ncols = ii[i + 1] - ii[i];
3402     const PetscInt *icols = jj + ii[i];
3403     if (bs == 1 || !roworiented) {
3404       const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3405       PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES));
3406     } else {
3407       PetscInt j;
3408       for (j = 0; j < ncols; j++) {
3409         const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3410         PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES));
3411       }
3412     }
3413   }
3414   if (!V) PetscCall(PetscFree(values));
3415   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3416   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3417   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3418   PetscFunctionReturn(PETSC_SUCCESS);
3419 }
3420 
3421 /*@C
3422   MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored
3423 
3424   Not Collective
3425 
3426   Input Parameter:
3427 . A - a `MATSEQBAIJ` matrix
3428 
3429   Output Parameter:
3430 . array - pointer to the data
3431 
3432   Level: intermediate
3433 
3434 .seealso: [](ch_matrices), `Mat`, `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3435 @*/
MatSeqBAIJGetArray(Mat A,PetscScalar * array[])3436 PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar *array[])
3437 {
3438   PetscFunctionBegin;
3439   PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3440   PetscFunctionReturn(PETSC_SUCCESS);
3441 }
3442 
3443 /*@C
3444   MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`
3445 
3446   Not Collective
3447 
3448   Input Parameters:
3449 + A     - a `MATSEQBAIJ` matrix
3450 - array - pointer to the data
3451 
3452   Level: intermediate
3453 
3454 .seealso: [](ch_matrices), `Mat`, `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3455 @*/
MatSeqBAIJRestoreArray(Mat A,PetscScalar * array[])3456 PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar *array[])
3457 {
3458   PetscFunctionBegin;
3459   PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3460   PetscFunctionReturn(PETSC_SUCCESS);
3461 }
3462 
3463 /*MC
3464    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3465    block sparse compressed row format.
3466 
3467    Options Database Keys:
3468 + -mat_type seqbaij - sets the matrix type to `MATSEQBAIJ` during a call to `MatSetFromOptions()`
3469 - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)
3470 
3471    Level: beginner
3472 
3473    Notes:
3474    `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3475    space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
3476 
3477    Run with `-info` to see what version of the matrix-vector product is being used
3478 
3479 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqBAIJ()`
3480 M*/
3481 
3482 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);
3483 
MatCreate_SeqBAIJ(Mat B)3484 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3485 {
3486   PetscMPIInt  size;
3487   Mat_SeqBAIJ *b;
3488 
3489   PetscFunctionBegin;
3490   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
3491   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Comm must be of size 1");
3492 
3493   PetscCall(PetscNew(&b));
3494   B->data   = (void *)b;
3495   B->ops[0] = MatOps_Values;
3496 
3497   b->row          = NULL;
3498   b->col          = NULL;
3499   b->icol         = NULL;
3500   b->reallocs     = 0;
3501   b->saved_values = NULL;
3502 
3503   b->roworiented        = PETSC_TRUE;
3504   b->nonew              = 0;
3505   b->diag               = NULL;
3506   B->spptr              = NULL;
3507   B->info.nz_unneeded   = (PetscReal)b->maxnz * b->bs2;
3508   b->keepnonzeropattern = PETSC_FALSE;
3509 
3510   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ));
3511   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ));
3512   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ));
3513   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ));
3514   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ));
3515   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ));
3516   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ));
3517   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ));
3518   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ));
3519   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ));
3520 #if defined(PETSC_HAVE_HYPRE)
3521   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE));
3522 #endif
3523   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS));
3524   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));
3525   PetscFunctionReturn(PETSC_SUCCESS);
3526 }
3527 
MatDuplicateNoCreate_SeqBAIJ(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)3528 PETSC_INTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
3529 {
3530   Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3531   PetscInt     i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;
3532 
3533   PetscFunctionBegin;
3534   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Cannot duplicate unassembled matrix");
3535   PetscCheck(a->i[mbs] == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupt matrix");
3536 
3537   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3538     c->imax           = a->imax;
3539     c->ilen           = a->ilen;
3540     c->free_imax_ilen = PETSC_FALSE;
3541   } else {
3542     PetscCall(PetscMalloc2(mbs, &c->imax, mbs, &c->ilen));
3543     for (i = 0; i < mbs; i++) {
3544       c->imax[i] = a->imax[i];
3545       c->ilen[i] = a->ilen[i];
3546     }
3547     c->free_imax_ilen = PETSC_TRUE;
3548   }
3549 
3550   /* allocate the matrix space */
3551   if (mallocmatspace) {
3552     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3553       PetscCall(PetscShmgetAllocateArray(bs2 * nz, sizeof(PetscScalar), (void **)&c->a));
3554       PetscCall(PetscArrayzero(c->a, bs2 * nz));
3555       c->free_a       = PETSC_TRUE;
3556       c->i            = a->i;
3557       c->j            = a->j;
3558       c->free_ij      = PETSC_FALSE;
3559       c->parent       = A;
3560       C->preallocated = PETSC_TRUE;
3561       C->assembled    = PETSC_TRUE;
3562 
3563       PetscCall(PetscObjectReference((PetscObject)A));
3564       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3565       PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3566     } else {
3567       PetscCall(PetscShmgetAllocateArray(bs2 * nz, sizeof(PetscScalar), (void **)&c->a));
3568       PetscCall(PetscShmgetAllocateArray(nz, sizeof(PetscInt), (void **)&c->j));
3569       PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&c->i));
3570       c->free_a  = PETSC_TRUE;
3571       c->free_ij = PETSC_TRUE;
3572 
3573       PetscCall(PetscArraycpy(c->i, a->i, mbs + 1));
3574       if (mbs > 0) {
3575         PetscCall(PetscArraycpy(c->j, a->j, nz));
3576         if (cpvalues == MAT_COPY_VALUES) {
3577           PetscCall(PetscArraycpy(c->a, a->a, bs2 * nz));
3578         } else {
3579           PetscCall(PetscArrayzero(c->a, bs2 * nz));
3580         }
3581       }
3582       C->preallocated = PETSC_TRUE;
3583       C->assembled    = PETSC_TRUE;
3584     }
3585   }
3586 
3587   c->roworiented = a->roworiented;
3588   c->nonew       = a->nonew;
3589 
3590   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
3591   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
3592 
3593   c->bs2        = a->bs2;
3594   c->mbs        = a->mbs;
3595   c->nbs        = a->nbs;
3596   c->nz         = a->nz;
3597   c->maxnz      = a->nz; /* Since we allocate exactly the right amount */
3598   c->solve_work = NULL;
3599   c->mult_work  = NULL;
3600   c->sor_workt  = NULL;
3601   c->sor_work   = NULL;
3602 
3603   c->compressedrow.use   = a->compressedrow.use;
3604   c->compressedrow.nrows = a->compressedrow.nrows;
3605   if (a->compressedrow.use) {
3606     i = a->compressedrow.nrows;
3607     PetscCall(PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex));
3608     PetscCall(PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1));
3609     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i));
3610   } else {
3611     c->compressedrow.use    = PETSC_FALSE;
3612     c->compressedrow.i      = NULL;
3613     c->compressedrow.rindex = NULL;
3614   }
3615   c->nonzerorowcnt = a->nonzerorowcnt;
3616   C->nonzerostate  = A->nonzerostate;
3617 
3618   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
3619   PetscFunctionReturn(PETSC_SUCCESS);
3620 }
3621 
MatDuplicate_SeqBAIJ(Mat A,MatDuplicateOption cpvalues,Mat * B)3622 PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B)
3623 {
3624   PetscFunctionBegin;
3625   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
3626   PetscCall(MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n));
3627   PetscCall(MatSetType(*B, MATSEQBAIJ));
3628   PetscCall(MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE));
3629   PetscFunctionReturn(PETSC_SUCCESS);
3630 }
3631 
3632 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
MatLoad_SeqBAIJ_Binary(Mat mat,PetscViewer viewer)3633 PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
3634 {
3635   PetscInt     header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3636   PetscInt    *rowidxs, *colidxs;
3637   PetscScalar *matvals;
3638 
3639   PetscFunctionBegin;
3640   PetscCall(PetscViewerSetUp(viewer));
3641 
3642   /* read matrix header */
3643   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3644   PetscCheck(header[0] == MAT_FILE_CLASSID, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3645   M  = header[1];
3646   N  = header[2];
3647   nz = header[3];
3648   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3649   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3650   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as SeqBAIJ");
3651 
3652   /* set block sizes from the viewer's .info file */
3653   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3654   /* set local and global sizes if not set already */
3655   if (mat->rmap->n < 0) mat->rmap->n = M;
3656   if (mat->cmap->n < 0) mat->cmap->n = N;
3657   if (mat->rmap->N < 0) mat->rmap->N = M;
3658   if (mat->cmap->N < 0) mat->cmap->N = N;
3659   PetscCall(PetscLayoutSetUp(mat->rmap));
3660   PetscCall(PetscLayoutSetUp(mat->cmap));
3661 
3662   /* check if the matrix sizes are correct */
3663   PetscCall(MatGetSize(mat, &rows, &cols));
3664   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3665   PetscCall(MatGetBlockSize(mat, &bs));
3666   PetscCall(MatGetLocalSize(mat, &m, &n));
3667   mbs = m / bs;
3668   nbs = n / bs;
3669 
3670   /* read in row lengths, column indices and nonzero values */
3671   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3672   PetscCall(PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT));
3673   rowidxs[0] = 0;
3674   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3675   sum = rowidxs[m];
3676   PetscCheck(sum == nz, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3677 
3678   /* read in column indices and nonzero values */
3679   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals));
3680   PetscCall(PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT));
3681   PetscCall(PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR));
3682 
3683   {               /* preallocate matrix storage */
3684     PetscBT   bt; /* helper bit set to count nonzeros */
3685     PetscInt *nnz;
3686     PetscBool sbaij;
3687 
3688     PetscCall(PetscBTCreate(nbs, &bt));
3689     PetscCall(PetscCalloc1(mbs, &nnz));
3690     PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij));
3691     for (i = 0; i < mbs; i++) {
3692       PetscCall(PetscBTMemzero(nbs, bt));
3693       for (k = 0; k < bs; k++) {
3694         PetscInt row = bs * i + k;
3695         for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3696           PetscInt col = colidxs[j];
3697           if (!sbaij || col >= row)
3698             if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3699         }
3700       }
3701     }
3702     PetscCall(PetscBTDestroy(&bt));
3703     PetscCall(MatSeqBAIJSetPreallocation(mat, bs, 0, nnz));
3704     PetscCall(MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz));
3705     PetscCall(PetscFree(nnz));
3706   }
3707 
3708   /* store matrix values */
3709   for (i = 0; i < m; i++) {
3710     PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3711     PetscUseTypeMethod(mat, setvalues, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES);
3712   }
3713 
3714   PetscCall(PetscFree(rowidxs));
3715   PetscCall(PetscFree2(colidxs, matvals));
3716   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
3717   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
3718   PetscFunctionReturn(PETSC_SUCCESS);
3719 }
3720 
MatLoad_SeqBAIJ(Mat mat,PetscViewer viewer)3721 PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer)
3722 {
3723   PetscBool isbinary;
3724 
3725   PetscFunctionBegin;
3726   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3727   PetscCheck(isbinary, PetscObjectComm((PetscObject)viewer), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)mat)->type_name);
3728   PetscCall(MatLoad_SeqBAIJ_Binary(mat, viewer));
3729   PetscFunctionReturn(PETSC_SUCCESS);
3730 }
3731 
3732 /*@
3733   MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3734   compressed row) format.  For good matrix assembly performance the
3735   user should preallocate the matrix storage by setting the parameter `nz`
3736   (or the array `nnz`).
3737 
3738   Collective
3739 
3740   Input Parameters:
3741 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3742 . bs   - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3743          blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3744 . m    - number of rows
3745 . n    - number of columns
3746 . nz   - number of nonzero blocks  per block row (same for all rows)
3747 - nnz  - array containing the number of nonzero blocks in the various block rows
3748          (possibly different for each block row) or `NULL`
3749 
3750   Output Parameter:
3751 . A - the matrix
3752 
3753   Options Database Keys:
3754 + -mat_no_unroll  - uses code that does not unroll the loops in the block calculations (much slower)
3755 - -mat_block_size - size of the blocks to use
3756 
3757   Level: intermediate
3758 
3759   Notes:
3760   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3761   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3762   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3763 
3764   The number of rows and columns must be divisible by blocksize.
3765 
3766   If the `nnz` parameter is given then the `nz` parameter is ignored
3767 
3768   A nonzero block is any block that as 1 or more nonzeros in it
3769 
3770   The `MATSEQBAIJ` format is fully compatible with standard Fortran
3771   storage.  That is, the stored row and column indices can begin at
3772   either one (as in Fortran) or zero.
3773 
3774   Specify the preallocated storage with either `nz` or `nnz` (not both).
3775   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3776   allocation.  See [Sparse Matrices](sec_matsparse) for details.
3777   matrices.
3778 
3779 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3780 @*/
MatCreateSeqBAIJ(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)3781 PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3782 {
3783   PetscFunctionBegin;
3784   PetscCall(MatCreate(comm, A));
3785   PetscCall(MatSetSizes(*A, m, n, m, n));
3786   PetscCall(MatSetType(*A, MATSEQBAIJ));
3787   PetscCall(MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz));
3788   PetscFunctionReturn(PETSC_SUCCESS);
3789 }
3790 
3791 /*@
3792   MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3793   per row in the matrix. For good matrix assembly performance the
3794   user should preallocate the matrix storage by setting the parameter `nz`
3795   (or the array `nnz`).
3796 
3797   Collective
3798 
3799   Input Parameters:
3800 + B   - the matrix
3801 . bs  - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3802         blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3803 . nz  - number of block nonzeros per block row (same for all rows)
3804 - nnz - array containing the number of block nonzeros in the various block rows
3805         (possibly different for each block row) or `NULL`
3806 
3807   Options Database Keys:
3808 + -mat_no_unroll  - uses code that does not unroll the loops in the block calculations (much slower)
3809 - -mat_block_size - size of the blocks to use
3810 
3811   Level: intermediate
3812 
3813   Notes:
3814   If the `nnz` parameter is given then the `nz` parameter is ignored
3815 
3816   You can call `MatGetInfo()` to get information on how effective the preallocation was;
3817   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3818   You can also run with the option `-info` and look for messages with the string
3819   malloc in them to see if additional memory allocation was needed.
3820 
3821   The `MATSEQBAIJ` format is fully compatible with standard Fortran
3822   storage.  That is, the stored row and column indices can begin at
3823   either one (as in Fortran) or zero.
3824 
3825   Specify the preallocated storage with either `nz` or `nnz` (not both).
3826   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3827   allocation.  See [Sparse Matrices](sec_matsparse) for details.
3828 
3829 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3830 @*/
MatSeqBAIJSetPreallocation(Mat B,PetscInt bs,PetscInt nz,const PetscInt nnz[])3831 PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3832 {
3833   PetscFunctionBegin;
3834   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3835   PetscValidType(B, 1);
3836   PetscValidLogicalCollectiveInt(B, bs, 2);
3837   PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3838   PetscFunctionReturn(PETSC_SUCCESS);
3839 }
3840 
3841 /*@C
3842   MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values
3843 
3844   Collective
3845 
3846   Input Parameters:
3847 + B  - the matrix
3848 . bs - the blocksize
3849 . i  - the indices into `j` for the start of each local row (indices start with zero)
3850 . j  - the column indices for each local row (indices start with zero) these must be sorted for each row
3851 - v  - optional values in the matrix, use `NULL` if not provided
3852 
3853   Level: advanced
3854 
3855   Notes:
3856   The `i`,`j`,`v` values are COPIED with this routine; to avoid the copy use `MatCreateSeqBAIJWithArrays()`
3857 
3858   The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`.  For example, C programs
3859   may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3860   over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3861   `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3862   block column and the second index is over columns within a block.
3863 
3864   Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well
3865 
3866 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3867 @*/
MatSeqBAIJSetPreallocationCSR(Mat B,PetscInt bs,const PetscInt i[],const PetscInt j[],const PetscScalar v[])3868 PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3869 {
3870   PetscFunctionBegin;
3871   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3872   PetscValidType(B, 1);
3873   PetscValidLogicalCollectiveInt(B, bs, 2);
3874   PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3875   PetscFunctionReturn(PETSC_SUCCESS);
3876 }
3877 
3878 /*@
3879   MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.
3880 
3881   Collective
3882 
3883   Input Parameters:
3884 + comm - must be an MPI communicator of size 1
3885 . bs   - size of block
3886 . m    - number of rows
3887 . n    - number of columns
3888 . i    - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3889 . j    - column indices
3890 - a    - matrix values
3891 
3892   Output Parameter:
3893 . mat - the matrix
3894 
3895   Level: advanced
3896 
3897   Notes:
3898   The `i`, `j`, and `a` arrays are not copied by this routine, the user must free these arrays
3899   once the matrix is destroyed
3900 
3901   You cannot set new nonzero locations into this matrix, that will generate an error.
3902 
3903   The `i` and `j` indices are 0 based
3904 
3905   When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format
3906 
3907   The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3908   the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3909   block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3910   with column-major ordering within blocks.
3911 
3912 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3913 @*/
MatCreateSeqBAIJWithArrays(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],Mat * mat)3914 PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat)
3915 {
3916   Mat_SeqBAIJ *baij;
3917 
3918   PetscFunctionBegin;
3919   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "block size %" PetscInt_FMT " > 1 is not supported yet", bs);
3920   if (m > 0) PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
3921 
3922   PetscCall(MatCreate(comm, mat));
3923   PetscCall(MatSetSizes(*mat, m, n, m, n));
3924   PetscCall(MatSetType(*mat, MATSEQBAIJ));
3925   PetscCall(MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL));
3926   baij = (Mat_SeqBAIJ *)(*mat)->data;
3927   PetscCall(PetscMalloc2(m, &baij->imax, m, &baij->ilen));
3928 
3929   baij->i = i;
3930   baij->j = j;
3931   baij->a = a;
3932 
3933   baij->nonew          = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3934   baij->free_a         = PETSC_FALSE;
3935   baij->free_ij        = PETSC_FALSE;
3936   baij->free_imax_ilen = PETSC_TRUE;
3937 
3938   for (PetscInt ii = 0; ii < m; ii++) {
3939     const PetscInt row_len = i[ii + 1] - i[ii];
3940 
3941     baij->ilen[ii] = baij->imax[ii] = row_len;
3942     PetscCheck(row_len >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative row length in i (row indices) row = %" PetscInt_FMT " length = %" PetscInt_FMT, ii, row_len);
3943   }
3944   if (PetscDefined(USE_DEBUG)) {
3945     for (PetscInt ii = 0; ii < baij->i[m]; ii++) {
3946       PetscCheck(j[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative column index at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3947       PetscCheck(j[ii] <= n - 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index to large at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3948     }
3949   }
3950 
3951   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
3952   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
3953   PetscFunctionReturn(PETSC_SUCCESS);
3954 }
3955 
MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm,Mat inmat,PetscInt n,MatReuse scall,Mat * outmat)3956 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
3957 {
3958   PetscFunctionBegin;
3959   PetscCall(MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat));
3960   PetscFunctionReturn(PETSC_SUCCESS);
3961 }
3962