xref: /petsc/src/mat/impls/aij/mpi/fdmpiaij.c (revision 43cdf1ebbcbfa05bee08e48007ef1bae3f20f4e9)
1 #include <../src/mat/impls/sell/mpi/mpisell.h>
2 #include <../src/mat/impls/aij/mpi/mpiaij.h>
3 #include <../src/mat/impls/baij/mpi/mpibaij.h>
4 #include <petsc/private/isimpl.h>
5 
6 PetscErrorCode MatFDColoringApply_BAIJ(Mat J, MatFDColoring coloring, Vec x1, void *sctx)
7 {
8   PetscErrorCode (*f)(void *, Vec, Vec, void *) = (PetscErrorCode(*)(void *, Vec, Vec, void *))coloring->f;
9   PetscInt           k, cstart, cend, l, row, col, nz, spidx, i, j;
10   PetscScalar        dx = 0.0, *w3_array, *dy_i, *dy = coloring->dy;
11   PetscScalar       *vscale_array;
12   const PetscScalar *xx;
13   PetscReal          epsilon = coloring->error_rel, umin = coloring->umin, unorm;
14   Vec                w1 = coloring->w1, w2 = coloring->w2, w3, vscale = coloring->vscale;
15   void              *fctx  = coloring->fctx;
16   PetscInt           ctype = coloring->ctype, nxloc, nrows_k;
17   PetscScalar       *valaddr;
18   MatEntry          *Jentry  = coloring->matentry;
19   MatEntry2         *Jentry2 = coloring->matentry2;
20   const PetscInt     ncolors = coloring->ncolors, *ncolumns = coloring->ncolumns, *nrows = coloring->nrows;
21   PetscInt           bs = J->rmap->bs;
22 
23   PetscFunctionBegin;
24   PetscCall(VecBindToCPU(x1, PETSC_TRUE));
25   /* (1) Set w1 = F(x1) */
26   if (!coloring->fset) {
27     PetscCall(PetscLogEventBegin(MAT_FDColoringFunction, coloring, 0, 0, 0));
28     PetscCall((*f)(sctx, x1, w1, fctx));
29     PetscCall(PetscLogEventEnd(MAT_FDColoringFunction, coloring, 0, 0, 0));
30   } else {
31     coloring->fset = PETSC_FALSE;
32   }
33 
34   /* (2) Compute vscale = 1./dx - the local scale factors, including ghost points */
35   PetscCall(VecGetLocalSize(x1, &nxloc));
36   if (coloring->htype[0] == 'w') {
37     /* vscale = dx is a constant scalar */
38     PetscCall(VecNorm(x1, NORM_2, &unorm));
39     dx = 1.0 / (PetscSqrtReal(1.0 + unorm) * epsilon);
40   } else {
41     PetscCall(VecGetArrayRead(x1, &xx));
42     PetscCall(VecGetArray(vscale, &vscale_array));
43     for (col = 0; col < nxloc; col++) {
44       dx = xx[col];
45       if (PetscAbsScalar(dx) < umin) {
46         if (PetscRealPart(dx) >= 0.0) dx = umin;
47         else if (PetscRealPart(dx) < 0.0) dx = -umin;
48       }
49       dx *= epsilon;
50       vscale_array[col] = 1.0 / dx;
51     }
52     PetscCall(VecRestoreArrayRead(x1, &xx));
53     PetscCall(VecRestoreArray(vscale, &vscale_array));
54   }
55   if (ctype == IS_COLORING_GLOBAL && coloring->htype[0] == 'd') {
56     PetscCall(VecGhostUpdateBegin(vscale, INSERT_VALUES, SCATTER_FORWARD));
57     PetscCall(VecGhostUpdateEnd(vscale, INSERT_VALUES, SCATTER_FORWARD));
58   }
59 
60   /* (3) Loop over each color */
61   if (!coloring->w3) {
62     PetscCall(VecDuplicate(x1, &coloring->w3));
63     /* Vec is used intensively in particular piece of scalar CPU code; won't benefit from bouncing back and forth to the GPU */
64     PetscCall(VecBindToCPU(coloring->w3, PETSC_TRUE));
65   }
66   w3 = coloring->w3;
67 
68   PetscCall(VecGetOwnershipRange(x1, &cstart, &cend)); /* used by ghosted vscale */
69   if (vscale) PetscCall(VecGetArray(vscale, &vscale_array));
70   nz = 0;
71   for (k = 0; k < ncolors; k++) {
72     coloring->currentcolor = k;
73 
74     /*
75       (3-1) Loop over each column associated with color
76       adding the perturbation to the vector w3 = x1 + dx.
77     */
78     PetscCall(VecCopy(x1, w3));
79     dy_i = dy;
80     for (i = 0; i < bs; i++) { /* Loop over a block of columns */
81       PetscCall(VecGetArray(w3, &w3_array));
82       if (ctype == IS_COLORING_GLOBAL) w3_array -= cstart; /* shift pointer so global index can be used */
83       if (coloring->htype[0] == 'w') {
84         for (l = 0; l < ncolumns[k]; l++) {
85           col = i + bs * coloring->columns[k][l]; /* local column (in global index!) of the matrix we are probing for */
86           w3_array[col] += 1.0 / dx;
87           if (i) w3_array[col - 1] -= 1.0 / dx; /* resume original w3[col-1] */
88         }
89       } else {                  /* htype == 'ds' */
90         vscale_array -= cstart; /* shift pointer so global index can be used */
91         for (l = 0; l < ncolumns[k]; l++) {
92           col = i + bs * coloring->columns[k][l]; /* local column (in global index!) of the matrix we are probing for */
93           w3_array[col] += 1.0 / vscale_array[col];
94           if (i) w3_array[col - 1] -= 1.0 / vscale_array[col - 1]; /* resume original w3[col-1] */
95         }
96         vscale_array += cstart;
97       }
98       if (ctype == IS_COLORING_GLOBAL) w3_array += cstart;
99       PetscCall(VecRestoreArray(w3, &w3_array));
100 
101       /*
102        (3-2) Evaluate function at w3 = x1 + dx (here dx is a vector of perturbations)
103                            w2 = F(x1 + dx) - F(x1)
104        */
105       PetscCall(PetscLogEventBegin(MAT_FDColoringFunction, 0, 0, 0, 0));
106       PetscCall(VecPlaceArray(w2, dy_i)); /* place w2 to the array dy_i */
107       PetscCall((*f)(sctx, w3, w2, fctx));
108       PetscCall(PetscLogEventEnd(MAT_FDColoringFunction, 0, 0, 0, 0));
109       PetscCall(VecAXPY(w2, -1.0, w1));
110       PetscCall(VecResetArray(w2));
111       dy_i += nxloc; /* points to dy+i*nxloc */
112     }
113 
114     /*
115      (3-3) Loop over rows of vector, putting results into Jacobian matrix
116     */
117     nrows_k = nrows[k];
118     if (coloring->htype[0] == 'w') {
119       for (l = 0; l < nrows_k; l++) {
120         row     = bs * Jentry2[nz].row; /* local row index */
121         valaddr = Jentry2[nz++].valaddr;
122         spidx   = 0;
123         dy_i    = dy;
124         for (i = 0; i < bs; i++) {   /* column of the block */
125           for (j = 0; j < bs; j++) { /* row of the block */
126             valaddr[spidx++] = dy_i[row + j] * dx;
127           }
128           dy_i += nxloc; /* points to dy+i*nxloc */
129         }
130       }
131     } else { /* htype == 'ds' */
132       for (l = 0; l < nrows_k; l++) {
133         row     = bs * Jentry[nz].row; /* local row index */
134         col     = bs * Jentry[nz].col; /* local column index */
135         valaddr = Jentry[nz++].valaddr;
136         spidx   = 0;
137         dy_i    = dy;
138         for (i = 0; i < bs; i++) {   /* column of the block */
139           for (j = 0; j < bs; j++) { /* row of the block */
140             valaddr[spidx++] = dy_i[row + j] * vscale_array[col + i];
141           }
142           dy_i += nxloc; /* points to dy+i*nxloc */
143         }
144       }
145     }
146   }
147   PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));
148   PetscCall(MatAssemblyEnd(J, MAT_FINAL_ASSEMBLY));
149   if (vscale) PetscCall(VecRestoreArray(vscale, &vscale_array));
150 
151   coloring->currentcolor = -1;
152   PetscCall(VecBindToCPU(x1, PETSC_FALSE));
153   PetscFunctionReturn(0);
154 }
155 
156 /* this is declared PETSC_EXTERN because it is used by MatFDColoringUseDM() which is in the DM library */
157 PetscErrorCode MatFDColoringApply_AIJ(Mat J, MatFDColoring coloring, Vec x1, void *sctx)
158 {
159   PetscErrorCode (*f)(void *, Vec, Vec, void *) = (PetscErrorCode(*)(void *, Vec, Vec, void *))coloring->f;
160   PetscInt           k, cstart, cend, l, row, col, nz;
161   PetscScalar        dx = 0.0, *y, *w3_array;
162   const PetscScalar *xx;
163   PetscScalar       *vscale_array;
164   PetscReal          epsilon = coloring->error_rel, umin = coloring->umin, unorm;
165   Vec                w1 = coloring->w1, w2 = coloring->w2, w3, vscale = coloring->vscale;
166   void              *fctx  = coloring->fctx;
167   ISColoringType     ctype = coloring->ctype;
168   PetscInt           nxloc, nrows_k;
169   MatEntry          *Jentry  = coloring->matentry;
170   MatEntry2         *Jentry2 = coloring->matentry2;
171   const PetscInt     ncolors = coloring->ncolors, *ncolumns = coloring->ncolumns, *nrows = coloring->nrows;
172   PetscBool          alreadyboundtocpu;
173 
174   PetscFunctionBegin;
175   PetscCall(VecBoundToCPU(x1, &alreadyboundtocpu));
176   PetscCall(VecBindToCPU(x1, PETSC_TRUE));
177   PetscCheck(!(ctype == IS_COLORING_LOCAL) || !(J->ops->fdcoloringapply == MatFDColoringApply_AIJ), PetscObjectComm((PetscObject)J), PETSC_ERR_SUP, "Must call MatColoringUseDM() with IS_COLORING_LOCAL");
178   /* (1) Set w1 = F(x1) */
179   if (!coloring->fset) {
180     PetscCall(PetscLogEventBegin(MAT_FDColoringFunction, 0, 0, 0, 0));
181     PetscCall((*f)(sctx, x1, w1, fctx));
182     PetscCall(PetscLogEventEnd(MAT_FDColoringFunction, 0, 0, 0, 0));
183   } else {
184     coloring->fset = PETSC_FALSE;
185   }
186 
187   /* (2) Compute vscale = 1./dx - the local scale factors, including ghost points */
188   if (coloring->htype[0] == 'w') {
189     /* vscale = 1./dx is a constant scalar */
190     PetscCall(VecNorm(x1, NORM_2, &unorm));
191     dx = 1.0 / (PetscSqrtReal(1.0 + unorm) * epsilon);
192   } else {
193     PetscCall(VecGetLocalSize(x1, &nxloc));
194     PetscCall(VecGetArrayRead(x1, &xx));
195     PetscCall(VecGetArray(vscale, &vscale_array));
196     for (col = 0; col < nxloc; col++) {
197       dx = xx[col];
198       if (PetscAbsScalar(dx) < umin) {
199         if (PetscRealPart(dx) >= 0.0) dx = umin;
200         else if (PetscRealPart(dx) < 0.0) dx = -umin;
201       }
202       dx *= epsilon;
203       vscale_array[col] = 1.0 / dx;
204     }
205     PetscCall(VecRestoreArrayRead(x1, &xx));
206     PetscCall(VecRestoreArray(vscale, &vscale_array));
207   }
208   if (ctype == IS_COLORING_GLOBAL && coloring->htype[0] == 'd') {
209     PetscCall(VecGhostUpdateBegin(vscale, INSERT_VALUES, SCATTER_FORWARD));
210     PetscCall(VecGhostUpdateEnd(vscale, INSERT_VALUES, SCATTER_FORWARD));
211   }
212 
213   /* (3) Loop over each color */
214   if (!coloring->w3) { PetscCall(VecDuplicate(x1, &coloring->w3)); }
215   w3 = coloring->w3;
216 
217   PetscCall(VecGetOwnershipRange(x1, &cstart, &cend)); /* used by ghosted vscale */
218   if (vscale) PetscCall(VecGetArray(vscale, &vscale_array));
219   nz = 0;
220 
221   if (coloring->bcols > 1) { /* use blocked insertion of Jentry */
222     PetscInt     i, m = J->rmap->n, nbcols, bcols = coloring->bcols;
223     PetscScalar *dy = coloring->dy, *dy_k;
224 
225     nbcols = 0;
226     for (k = 0; k < ncolors; k += bcols) {
227       /*
228        (3-1) Loop over each column associated with color
229        adding the perturbation to the vector w3 = x1 + dx.
230        */
231 
232       dy_k = dy;
233       if (k + bcols > ncolors) bcols = ncolors - k;
234       for (i = 0; i < bcols; i++) {
235         coloring->currentcolor = k + i;
236 
237         PetscCall(VecCopy(x1, w3));
238         PetscCall(VecGetArray(w3, &w3_array));
239         if (ctype == IS_COLORING_GLOBAL) w3_array -= cstart; /* shift pointer so global index can be used */
240         if (coloring->htype[0] == 'w') {
241           for (l = 0; l < ncolumns[k + i]; l++) {
242             col = coloring->columns[k + i][l]; /* local column (in global index!) of the matrix we are probing for */
243             w3_array[col] += 1.0 / dx;
244           }
245         } else {                  /* htype == 'ds' */
246           vscale_array -= cstart; /* shift pointer so global index can be used */
247           for (l = 0; l < ncolumns[k + i]; l++) {
248             col = coloring->columns[k + i][l]; /* local column (in global index!) of the matrix we are probing for */
249             w3_array[col] += 1.0 / vscale_array[col];
250           }
251           vscale_array += cstart;
252         }
253         if (ctype == IS_COLORING_GLOBAL) w3_array += cstart;
254         PetscCall(VecRestoreArray(w3, &w3_array));
255 
256         /*
257          (3-2) Evaluate function at w3 = x1 + dx (here dx is a vector of perturbations)
258                            w2 = F(x1 + dx) - F(x1)
259          */
260         PetscCall(PetscLogEventBegin(MAT_FDColoringFunction, 0, 0, 0, 0));
261         PetscCall(VecPlaceArray(w2, dy_k)); /* place w2 to the array dy_i */
262         PetscCall((*f)(sctx, w3, w2, fctx));
263         PetscCall(PetscLogEventEnd(MAT_FDColoringFunction, 0, 0, 0, 0));
264         PetscCall(VecAXPY(w2, -1.0, w1));
265         PetscCall(VecResetArray(w2));
266         dy_k += m; /* points to dy+i*nxloc */
267       }
268 
269       /*
270        (3-3) Loop over block rows of vector, putting results into Jacobian matrix
271        */
272       nrows_k = nrows[nbcols++];
273 
274       if (coloring->htype[0] == 'w') {
275         for (l = 0; l < nrows_k; l++) {
276           row = Jentry2[nz].row; /* local row index */
277                                  /* The 'useless' ifdef is due to a bug in NVIDIA nvc 21.11, which triggers a segfault on this line. We write it in
278              another way, and it seems work. See https://lists.mcs.anl.gov/pipermail/petsc-users/2021-December/045158.html
279            */
280 #if defined(PETSC_USE_COMPLEX)
281           PetscScalar *tmp = Jentry2[nz].valaddr;
282           *tmp             = dy[row] * dx;
283 #else
284           *(Jentry2[nz].valaddr) = dy[row] * dx;
285 #endif
286           nz++;
287         }
288       } else { /* htype == 'ds' */
289         for (l = 0; l < nrows_k; l++) {
290           row = Jentry[nz].row; /* local row index */
291 #if defined(PETSC_USE_COMPLEX)  /* See https://lists.mcs.anl.gov/pipermail/petsc-users/2021-December/045158.html */
292           PetscScalar *tmp = Jentry[nz].valaddr;
293           *tmp             = dy[row] * vscale_array[Jentry[nz].col];
294 #else
295           *(Jentry[nz].valaddr)  = dy[row] * vscale_array[Jentry[nz].col];
296 #endif
297           nz++;
298         }
299       }
300     }
301   } else { /* bcols == 1 */
302     for (k = 0; k < ncolors; k++) {
303       coloring->currentcolor = k;
304 
305       /*
306        (3-1) Loop over each column associated with color
307        adding the perturbation to the vector w3 = x1 + dx.
308        */
309       PetscCall(VecCopy(x1, w3));
310       PetscCall(VecGetArray(w3, &w3_array));
311       if (ctype == IS_COLORING_GLOBAL) w3_array -= cstart; /* shift pointer so global index can be used */
312       if (coloring->htype[0] == 'w') {
313         for (l = 0; l < ncolumns[k]; l++) {
314           col = coloring->columns[k][l]; /* local column (in global index!) of the matrix we are probing for */
315           w3_array[col] += 1.0 / dx;
316         }
317       } else {                  /* htype == 'ds' */
318         vscale_array -= cstart; /* shift pointer so global index can be used */
319         for (l = 0; l < ncolumns[k]; l++) {
320           col = coloring->columns[k][l]; /* local column (in global index!) of the matrix we are probing for */
321           w3_array[col] += 1.0 / vscale_array[col];
322         }
323         vscale_array += cstart;
324       }
325       if (ctype == IS_COLORING_GLOBAL) w3_array += cstart;
326       PetscCall(VecRestoreArray(w3, &w3_array));
327 
328       /*
329        (3-2) Evaluate function at w3 = x1 + dx (here dx is a vector of perturbations)
330                            w2 = F(x1 + dx) - F(x1)
331        */
332       PetscCall(PetscLogEventBegin(MAT_FDColoringFunction, 0, 0, 0, 0));
333       PetscCall((*f)(sctx, w3, w2, fctx));
334       PetscCall(PetscLogEventEnd(MAT_FDColoringFunction, 0, 0, 0, 0));
335       PetscCall(VecAXPY(w2, -1.0, w1));
336 
337       /*
338        (3-3) Loop over rows of vector, putting results into Jacobian matrix
339        */
340       nrows_k = nrows[k];
341       PetscCall(VecGetArray(w2, &y));
342       if (coloring->htype[0] == 'w') {
343         for (l = 0; l < nrows_k; l++) {
344           row = Jentry2[nz].row; /* local row index */
345 #if defined(PETSC_USE_COMPLEX)   /* See https://lists.mcs.anl.gov/pipermail/petsc-users/2021-December/045158.html */
346           PetscScalar *tmp = Jentry2[nz].valaddr;
347           *tmp             = y[row] * dx;
348 #else
349           *(Jentry2[nz].valaddr) = y[row] * dx;
350 #endif
351           nz++;
352         }
353       } else { /* htype == 'ds' */
354         for (l = 0; l < nrows_k; l++) {
355           row = Jentry[nz].row; /* local row index */
356 #if defined(PETSC_USE_COMPLEX)  /* See https://lists.mcs.anl.gov/pipermail/petsc-users/2021-December/045158.html */
357           PetscScalar *tmp = Jentry[nz].valaddr;
358           *tmp             = y[row] * vscale_array[Jentry[nz].col];
359 #else
360           *(Jentry[nz].valaddr)  = y[row] * vscale_array[Jentry[nz].col];
361 #endif
362           nz++;
363         }
364       }
365       PetscCall(VecRestoreArray(w2, &y));
366     }
367   }
368 
369 #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
370   if (J->offloadmask != PETSC_OFFLOAD_UNALLOCATED) J->offloadmask = PETSC_OFFLOAD_CPU;
371 #endif
372   PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));
373   PetscCall(MatAssemblyEnd(J, MAT_FINAL_ASSEMBLY));
374   if (vscale) PetscCall(VecRestoreArray(vscale, &vscale_array));
375   coloring->currentcolor = -1;
376   if (!alreadyboundtocpu) PetscCall(VecBindToCPU(x1, PETSC_FALSE));
377   PetscFunctionReturn(0);
378 }
379 
380 PetscErrorCode MatFDColoringSetUp_MPIXAIJ(Mat mat, ISColoring iscoloring, MatFDColoring c)
381 {
382   PetscMPIInt            size, *ncolsonproc, *disp, nn;
383   PetscInt               i, n, nrows, nrows_i, j, k, m, ncols, col, *rowhit, cstart, cend, colb;
384   const PetscInt        *is, *A_ci, *A_cj, *B_ci, *B_cj, *row = NULL, *ltog = NULL;
385   PetscInt               nis = iscoloring->n, nctot, *cols, tmp = 0;
386   ISLocalToGlobalMapping map   = mat->cmap->mapping;
387   PetscInt               ctype = c->ctype, *spidxA, *spidxB, nz, bs, bs2, spidx;
388   Mat                    A, B;
389   PetscScalar           *A_val, *B_val, **valaddrhit;
390   MatEntry              *Jentry;
391   MatEntry2             *Jentry2;
392   PetscBool              isBAIJ, isSELL;
393   PetscInt               bcols = c->bcols;
394 #if defined(PETSC_USE_CTABLE)
395   PetscHMapI colmap = NULL;
396 #else
397   PetscInt *colmap = NULL;      /* local col number of off-diag col */
398 #endif
399 
400   PetscFunctionBegin;
401   if (ctype == IS_COLORING_LOCAL) {
402     PetscCheck(map, PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_INCOMP, "When using ghosted differencing matrix must have local to global mapping provided with MatSetLocalToGlobalMapping");
403     PetscCall(ISLocalToGlobalMappingGetIndices(map, &ltog));
404   }
405 
406   PetscCall(MatGetBlockSize(mat, &bs));
407   PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATMPIBAIJ, &isBAIJ));
408   PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPISELL, &isSELL));
409   if (isBAIJ) {
410     Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *)mat->data;
411     Mat_SeqBAIJ *spA, *spB;
412     A     = baij->A;
413     spA   = (Mat_SeqBAIJ *)A->data;
414     A_val = spA->a;
415     B     = baij->B;
416     spB   = (Mat_SeqBAIJ *)B->data;
417     B_val = spB->a;
418     nz    = spA->nz + spB->nz; /* total nonzero entries of mat */
419     if (!baij->colmap) PetscCall(MatCreateColmap_MPIBAIJ_Private(mat));
420     colmap = baij->colmap;
421     PetscCall(MatGetColumnIJ_SeqBAIJ_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
422     PetscCall(MatGetColumnIJ_SeqBAIJ_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
423 
424     if (ctype == IS_COLORING_GLOBAL && c->htype[0] == 'd') { /* create vscale for storing dx */
425       PetscInt *garray;
426       PetscCall(PetscMalloc1(B->cmap->n, &garray));
427       for (i = 0; i < baij->B->cmap->n / bs; i++) {
428         for (j = 0; j < bs; j++) garray[i * bs + j] = bs * baij->garray[i] + j;
429       }
430       PetscCall(VecCreateGhost(PetscObjectComm((PetscObject)mat), mat->cmap->n, PETSC_DETERMINE, B->cmap->n, garray, &c->vscale));
431       PetscCall(VecBindToCPU(c->vscale, PETSC_TRUE));
432       PetscCall(PetscFree(garray));
433     }
434   } else if (isSELL) {
435     Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
436     Mat_SeqSELL *spA, *spB;
437     A     = sell->A;
438     spA   = (Mat_SeqSELL *)A->data;
439     A_val = spA->val;
440     B     = sell->B;
441     spB   = (Mat_SeqSELL *)B->data;
442     B_val = spB->val;
443     nz    = spA->nz + spB->nz; /* total nonzero entries of mat */
444     if (!sell->colmap) {
445       /* Allow access to data structures of local part of matrix
446        - creates aij->colmap which maps global column number to local number in part B */
447       PetscCall(MatCreateColmap_MPISELL_Private(mat));
448     }
449     colmap = sell->colmap;
450     PetscCall(MatGetColumnIJ_SeqSELL_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
451     PetscCall(MatGetColumnIJ_SeqSELL_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
452 
453     bs = 1; /* only bs=1 is supported for non MPIBAIJ matrix */
454 
455     if (ctype == IS_COLORING_GLOBAL && c->htype[0] == 'd') { /* create vscale for storing dx */
456       PetscCall(VecCreateGhost(PetscObjectComm((PetscObject)mat), mat->cmap->n, PETSC_DETERMINE, B->cmap->n, sell->garray, &c->vscale));
457       PetscCall(VecBindToCPU(c->vscale, PETSC_TRUE));
458     }
459   } else {
460     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
461     Mat_SeqAIJ *spA, *spB;
462     A     = aij->A;
463     spA   = (Mat_SeqAIJ *)A->data;
464     A_val = spA->a;
465     B     = aij->B;
466     spB   = (Mat_SeqAIJ *)B->data;
467     B_val = spB->a;
468     nz    = spA->nz + spB->nz; /* total nonzero entries of mat */
469     if (!aij->colmap) {
470       /* Allow access to data structures of local part of matrix
471        - creates aij->colmap which maps global column number to local number in part B */
472       PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
473     }
474     colmap = aij->colmap;
475     PetscCall(MatGetColumnIJ_SeqAIJ_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
476     PetscCall(MatGetColumnIJ_SeqAIJ_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
477 
478     bs = 1; /* only bs=1 is supported for non MPIBAIJ matrix */
479 
480     if (ctype == IS_COLORING_GLOBAL && c->htype[0] == 'd') { /* create vscale for storing dx */
481       PetscCall(VecCreateGhost(PetscObjectComm((PetscObject)mat), mat->cmap->n, PETSC_DETERMINE, B->cmap->n, aij->garray, &c->vscale));
482       PetscCall(VecBindToCPU(c->vscale, PETSC_TRUE));
483     }
484   }
485 
486   m      = mat->rmap->n / bs;
487   cstart = mat->cmap->rstart / bs;
488   cend   = mat->cmap->rend / bs;
489 
490   PetscCall(PetscMalloc2(nis, &c->ncolumns, nis, &c->columns));
491   PetscCall(PetscMalloc1(nis, &c->nrows));
492 
493   if (c->htype[0] == 'd') {
494     PetscCall(PetscMalloc1(nz, &Jentry));
495     c->matentry = Jentry;
496   } else if (c->htype[0] == 'w') {
497     PetscCall(PetscMalloc1(nz, &Jentry2));
498     c->matentry2 = Jentry2;
499   } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "htype is not supported");
500 
501   PetscCall(PetscMalloc2(m + 1, &rowhit, m + 1, &valaddrhit));
502   nz = 0;
503   PetscCall(ISColoringGetIS(iscoloring, PETSC_OWN_POINTER, PETSC_IGNORE, &c->isa));
504 
505   if (ctype == IS_COLORING_GLOBAL) {
506     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
507     PetscCall(PetscMalloc2(size, &ncolsonproc, size, &disp));
508   }
509 
510   for (i = 0; i < nis; i++) { /* for each local color */
511     PetscCall(ISGetLocalSize(c->isa[i], &n));
512     PetscCall(ISGetIndices(c->isa[i], &is));
513 
514     c->ncolumns[i] = n; /* local number of columns of this color on this process */
515     c->columns[i]  = (PetscInt *)is;
516 
517     if (ctype == IS_COLORING_GLOBAL) {
518       /* Determine nctot, the total (parallel) number of columns of this color */
519       /* ncolsonproc[j]: local ncolumns on proc[j] of this color */
520       PetscCall(PetscMPIIntCast(n, &nn));
521       PetscCallMPI(MPI_Allgather(&nn, 1, MPI_INT, ncolsonproc, 1, MPI_INT, PetscObjectComm((PetscObject)mat)));
522       nctot = 0;
523       for (j = 0; j < size; j++) nctot += ncolsonproc[j];
524       if (!nctot) PetscCall(PetscInfo(mat, "Coloring of matrix has some unneeded colors with no corresponding rows\n"));
525 
526       disp[0] = 0;
527       for (j = 1; j < size; j++) disp[j] = disp[j - 1] + ncolsonproc[j - 1];
528 
529       /* Get cols, the complete list of columns for this color on each process */
530       PetscCall(PetscMalloc1(nctot + 1, &cols));
531       PetscCallMPI(MPI_Allgatherv((void *)is, n, MPIU_INT, cols, ncolsonproc, disp, MPIU_INT, PetscObjectComm((PetscObject)mat)));
532     } else if (ctype == IS_COLORING_LOCAL) {
533       /* Determine local number of columns of this color on this process, including ghost points */
534       nctot = n;
535       cols  = (PetscInt *)is;
536     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not provided for this MatFDColoring type");
537 
538     /* Mark all rows affect by these columns */
539     PetscCall(PetscArrayzero(rowhit, m));
540     bs2     = bs * bs;
541     nrows_i = 0;
542     for (j = 0; j < nctot; j++) { /* loop over columns*/
543       if (ctype == IS_COLORING_LOCAL) {
544         col = ltog[cols[j]];
545       } else {
546         col = cols[j];
547       }
548       if (col >= cstart && col < cend) { /* column is in A, diagonal block of mat */
549         tmp   = A_ci[col - cstart];
550         row   = A_cj + tmp;
551         nrows = A_ci[col - cstart + 1] - tmp;
552         nrows_i += nrows;
553 
554         /* loop over columns of A marking them in rowhit */
555         for (k = 0; k < nrows; k++) {
556           /* set valaddrhit for part A */
557           spidx            = bs2 * spidxA[tmp + k];
558           valaddrhit[*row] = &A_val[spidx];
559           rowhit[*row++]   = col - cstart + 1; /* local column index */
560         }
561       } else { /* column is in B, off-diagonal block of mat */
562 #if defined(PETSC_USE_CTABLE)
563         PetscCall(PetscHMapIGetWithDefault(colmap, col + 1, 0, &colb));
564         colb--;
565 #else
566         colb = colmap[col] - 1; /* local column index */
567 #endif
568         if (colb == -1) {
569           nrows = 0;
570         } else {
571           colb  = colb / bs;
572           tmp   = B_ci[colb];
573           row   = B_cj + tmp;
574           nrows = B_ci[colb + 1] - tmp;
575         }
576         nrows_i += nrows;
577         /* loop over columns of B marking them in rowhit */
578         for (k = 0; k < nrows; k++) {
579           /* set valaddrhit for part B */
580           spidx            = bs2 * spidxB[tmp + k];
581           valaddrhit[*row] = &B_val[spidx];
582           rowhit[*row++]   = colb + 1 + cend - cstart; /* local column index */
583         }
584       }
585     }
586     c->nrows[i] = nrows_i;
587 
588     if (c->htype[0] == 'd') {
589       for (j = 0; j < m; j++) {
590         if (rowhit[j]) {
591           Jentry[nz].row     = j;             /* local row index */
592           Jentry[nz].col     = rowhit[j] - 1; /* local column index */
593           Jentry[nz].valaddr = valaddrhit[j]; /* address of mat value for this entry */
594           nz++;
595         }
596       }
597     } else { /* c->htype == 'wp' */
598       for (j = 0; j < m; j++) {
599         if (rowhit[j]) {
600           Jentry2[nz].row     = j;             /* local row index */
601           Jentry2[nz].valaddr = valaddrhit[j]; /* address of mat value for this entry */
602           nz++;
603         }
604       }
605     }
606     if (ctype == IS_COLORING_GLOBAL) PetscCall(PetscFree(cols));
607   }
608   if (ctype == IS_COLORING_GLOBAL) PetscCall(PetscFree2(ncolsonproc, disp));
609 
610   if (bcols > 1) { /* reorder Jentry for faster MatFDColoringApply() */
611     PetscCall(MatFDColoringSetUpBlocked_AIJ_Private(mat, c, nz));
612   }
613 
614   if (isBAIJ) {
615     PetscCall(MatRestoreColumnIJ_SeqBAIJ_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
616     PetscCall(MatRestoreColumnIJ_SeqBAIJ_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
617     PetscCall(PetscMalloc1(bs * mat->rmap->n, &c->dy));
618   } else if (isSELL) {
619     PetscCall(MatRestoreColumnIJ_SeqSELL_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
620     PetscCall(MatRestoreColumnIJ_SeqSELL_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
621   } else {
622     PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(A, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &A_ci, &A_cj, &spidxA, NULL));
623     PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(B, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &B_ci, &B_cj, &spidxB, NULL));
624   }
625 
626   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_OWN_POINTER, &c->isa));
627   PetscCall(PetscFree2(rowhit, valaddrhit));
628 
629   if (ctype == IS_COLORING_LOCAL) PetscCall(ISLocalToGlobalMappingRestoreIndices(map, &ltog));
630   PetscCall(PetscInfo(c, "ncolors %" PetscInt_FMT ", brows %" PetscInt_FMT " and bcols %" PetscInt_FMT " are used.\n", c->ncolors, c->brows, c->bcols));
631   PetscFunctionReturn(0);
632 }
633 
634 PetscErrorCode MatFDColoringCreate_MPIXAIJ(Mat mat, ISColoring iscoloring, MatFDColoring c)
635 {
636   PetscInt  bs, nis = iscoloring->n, m = mat->rmap->n;
637   PetscBool isBAIJ, isSELL;
638 
639   PetscFunctionBegin;
640   /* set default brows and bcols for speedup inserting the dense matrix into sparse Jacobian;
641    bcols is chosen s.t. dy-array takes 50% of memory space as mat */
642   PetscCall(MatGetBlockSize(mat, &bs));
643   PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATMPIBAIJ, &isBAIJ));
644   PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPISELL, &isSELL));
645   if (isBAIJ || m == 0) {
646     c->brows = m;
647     c->bcols = 1;
648   } else if (isSELL) {
649     /* bcols is chosen s.t. dy-array takes 50% of local memory space as mat */
650     Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
651     Mat_SeqSELL *spA, *spB;
652     Mat          A, B;
653     PetscInt     nz, brows, bcols;
654     PetscReal    mem;
655 
656     bs = 1; /* only bs=1 is supported for MPISELL matrix */
657 
658     A     = sell->A;
659     spA   = (Mat_SeqSELL *)A->data;
660     B     = sell->B;
661     spB   = (Mat_SeqSELL *)B->data;
662     nz    = spA->nz + spB->nz; /* total local nonzero entries of mat */
663     mem   = nz * (sizeof(PetscScalar) + sizeof(PetscInt)) + 3 * m * sizeof(PetscInt);
664     bcols = (PetscInt)(0.5 * mem / (m * sizeof(PetscScalar)));
665     brows = 1000 / bcols;
666     if (bcols > nis) bcols = nis;
667     if (brows == 0 || brows > m) brows = m;
668     c->brows = brows;
669     c->bcols = bcols;
670   } else { /* mpiaij matrix */
671     /* bcols is chosen s.t. dy-array takes 50% of local memory space as mat */
672     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
673     Mat_SeqAIJ *spA, *spB;
674     Mat         A, B;
675     PetscInt    nz, brows, bcols;
676     PetscReal   mem;
677 
678     bs = 1; /* only bs=1 is supported for MPIAIJ matrix */
679 
680     A     = aij->A;
681     spA   = (Mat_SeqAIJ *)A->data;
682     B     = aij->B;
683     spB   = (Mat_SeqAIJ *)B->data;
684     nz    = spA->nz + spB->nz; /* total local nonzero entries of mat */
685     mem   = nz * (sizeof(PetscScalar) + sizeof(PetscInt)) + 3 * m * sizeof(PetscInt);
686     bcols = (PetscInt)(0.5 * mem / (m * sizeof(PetscScalar)));
687     brows = 1000 / bcols;
688     if (bcols > nis) bcols = nis;
689     if (brows == 0 || brows > m) brows = m;
690     c->brows = brows;
691     c->bcols = bcols;
692   }
693 
694   c->M       = mat->rmap->N / bs; /* set the global rows and columns and local rows */
695   c->N       = mat->cmap->N / bs;
696   c->m       = mat->rmap->n / bs;
697   c->rstart  = mat->rmap->rstart / bs;
698   c->ncolors = nis;
699   PetscFunctionReturn(0);
700 }
701 
702 /*@C
703 
704     MatFDColoringSetValues - takes a matrix in compressed color format and enters the matrix into a PETSc `Mat`
705 
706    Collective
707 
708    Input Parameters:
709 +    J - the sparse matrix
710 .    coloring - created with `MatFDColoringCreate()` and a local coloring
711 -    y - column major storage of matrix values with one color of values per column, the number of rows of y should match
712          the number of local rows of J and the number of columns is the number of colors.
713 
714    Level: intermediate
715 
716    Notes: the matrix in compressed color format may come from an automatic differentiation code
717 
718    The code will be slightly faster if `MatFDColoringSetBlockSize`(coloring,`PETSC_DEFAULT`,nc); is called immediately after creating the coloring
719 
720 .seealso: `MatFDColoringCreate()`, `ISColoring`, `ISColoringCreate()`, `ISColoringSetType()`, `IS_COLORING_LOCAL`, `MatFDColoringSetBlockSize()`
721 @*/
722 PetscErrorCode MatFDColoringSetValues(Mat J, MatFDColoring coloring, const PetscScalar *y)
723 {
724   MatEntry2      *Jentry2;
725   PetscInt        row, i, nrows_k, l, ncolors, nz = 0, bcols, nbcols = 0;
726   const PetscInt *nrows;
727   PetscBool       eq;
728 
729   PetscFunctionBegin;
730   PetscValidHeaderSpecific(J, MAT_CLASSID, 1);
731   PetscValidHeaderSpecific(coloring, MAT_FDCOLORING_CLASSID, 2);
732   PetscCall(PetscObjectCompareId((PetscObject)J, coloring->matid, &eq));
733   PetscCheck(eq, PetscObjectComm((PetscObject)J), PETSC_ERR_ARG_WRONG, "Matrix used with MatFDColoringSetValues() must be that used with MatFDColoringCreate()");
734   Jentry2 = coloring->matentry2;
735   nrows   = coloring->nrows;
736   ncolors = coloring->ncolors;
737   bcols   = coloring->bcols;
738 
739   for (i = 0; i < ncolors; i += bcols) {
740     nrows_k = nrows[nbcols++];
741     for (l = 0; l < nrows_k; l++) {
742       row                      = Jentry2[nz].row; /* local row index */
743       *(Jentry2[nz++].valaddr) = y[row];
744     }
745     y += bcols * coloring->m;
746   }
747   PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));
748   PetscCall(MatAssemblyEnd(J, MAT_FINAL_ASSEMBLY));
749   PetscFunctionReturn(0);
750 }
751