1 /*
2    Basic functions for parallel dense matrices.
3    Portions of this code are under:
4    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
5 */
6 
7 #include <../src/mat/impls/dense/mpi/mpidense.h> /*I   "petscmat.h"  I*/
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscblaslapack.h>
10 #include <petsc/private/vecimpl.h>
11 #include <petscdevice.h>
12 #include <petsc/private/deviceimpl.h>
13 
14 /*@
15   MatDenseGetLocalMatrix - For a `MATMPIDENSE` matrix, returns the sequential matrix that stores
16   the locally owned portion of the operator; for a `MATSEQDENSE` matrix it returns the matrix itself.
17 
18   Input Parameter:
19 . A - the sequential or MPI `MATDENSE` matrix
20 
21   Output Parameter:
22 . B - the inner matrix
23 
24   Level: intermediate
25 
26 .seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MATMPIDENSE`, `MATSEQDENSE`
27 @*/
28 PetscErrorCode MatDenseGetLocalMatrix(Mat A, Mat *B)
29 {
30   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
31   PetscBool     flg;
32 
33   PetscFunctionBegin;
34   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
35   PetscAssertPointer(B, 2);
36   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIDENSE, &flg));
37   if (flg) *B = mat->A;
38   else {
39     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQDENSE, &flg));
40     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for matrix type %s", ((PetscObject)A)->type_name);
41     *B = A;
42   }
43   PetscFunctionReturn(PETSC_SUCCESS);
44 }
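/*
  Editor's usage sketch (not part of the PETSc source; sizes and names are
  illustrative). It shows a caller retrieving the per-rank sequential block
  of a parallel dense matrix:

    Mat      A, Aloc;
    PetscInt m, n;

    PetscCall(MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 20, NULL, &A));
    PetscCall(MatDenseGetLocalMatrix(A, &Aloc));
    PetscCall(MatGetSize(Aloc, &m, &n)); // m = locally owned rows of A, n = global columns
    PetscCall(MatDestroy(&A));           // Aloc is a borrowed reference; it is not destroyed separately
*/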
45 
46 static PetscErrorCode MatCopy_MPIDense(Mat A, Mat B, MatStructure s)
47 {
48   Mat_MPIDense *Amat = (Mat_MPIDense *)A->data;
49   Mat_MPIDense *Bmat = (Mat_MPIDense *)B->data;
50 
51   PetscFunctionBegin;
52   PetscCall(MatCopy(Amat->A, Bmat->A, s));
53   PetscFunctionReturn(PETSC_SUCCESS);
54 }
55 
56 PetscErrorCode MatShift_MPIDense(Mat A, PetscScalar alpha)
57 {
58   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
59   PetscInt      j, lda, rstart = A->rmap->rstart, rend = A->rmap->rend, rend2;
60   PetscScalar  *v;
61 
62   PetscFunctionBegin;
63   PetscCall(MatDenseGetArray(mat->A, &v));
64   PetscCall(MatDenseGetLDA(mat->A, &lda));
65   rend2 = PetscMin(rend, A->cmap->N);
66   if (rend2 > rstart) {
67     for (j = rstart; j < rend2; j++) v[j - rstart + j * lda] += alpha;
68     PetscCall(PetscLogFlops(rend2 - rstart));
69   }
70   PetscCall(MatDenseRestoreArray(mat->A, &v));
71   PetscFunctionReturn(PETSC_SUCCESS);
72 }
73 
74 static PetscErrorCode MatGetRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
75 {
76   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
77   PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;
78 
79   PetscFunctionBegin;
80   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
81   lrow = row - rstart;
82   PetscCall(MatGetRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
83   PetscFunctionReturn(PETSC_SUCCESS);
84 }
85 
86 static PetscErrorCode MatRestoreRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
87 {
88   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
89   PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;
90 
91   PetscFunctionBegin;
92   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
93   lrow = row - rstart;
94   PetscCall(MatRestoreRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatGetDiagonalBlock_MPIDense(Mat A, Mat *a)
99 {
100   Mat_MPIDense *mdn = (Mat_MPIDense *)A->data;
101   PetscInt      m = A->rmap->n, rstart = A->rmap->rstart;
102   PetscScalar  *array;
103   MPI_Comm      comm;
104   PetscBool     flg;
105   Mat           B;
106 
107   PetscFunctionBegin;
108   PetscCall(MatHasCongruentLayouts(A, &flg));
109   PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only square matrices supported.");
110   PetscCall(PetscObjectQuery((PetscObject)A, "DiagonalBlock", (PetscObject *)&B));
111   if (!B) { /* This should use MatDenseGetSubMatrix (not create), but we would need a call like MatRestoreDiagonalBlock */
112 #if PetscDefined(HAVE_CUDA)
113     PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSECUDA, &flg));
114     PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSECUDA);
115 #elif PetscDefined(HAVE_HIP)
116     PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSEHIP, &flg));
117     PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSEHIP);
118 #endif
119     PetscCall(PetscObjectGetComm((PetscObject)mdn->A, &comm));
120     PetscCall(MatCreate(comm, &B));
121     PetscCall(MatSetSizes(B, m, m, m, m));
122     PetscCall(MatSetType(B, ((PetscObject)mdn->A)->type_name));
123     PetscCall(MatDenseGetArrayRead(mdn->A, (const PetscScalar **)&array));
124     PetscCall(MatSeqDenseSetPreallocation(B, array + m * rstart));
125     PetscCall(MatDenseRestoreArrayRead(mdn->A, (const PetscScalar **)&array));
126     PetscCall(PetscObjectCompose((PetscObject)A, "DiagonalBlock", (PetscObject)B));
127     *a = B;
128     PetscCall(MatDestroy(&B));
129   } else *a = B;
130   PetscFunctionReturn(PETSC_SUCCESS);
131 }
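/*
  Editor's note (sketch, not part of the source): the diagonal block created
  above is cached on A under the key "DiagonalBlock", so repeated calls reuse
  it and the caller must not destroy the result:

    Mat B;

    PetscCall(MatGetDiagonalBlock(A, &B));           // B aliases the square diagonal block of A
    PetscCall(MatView(B, PETSC_VIEWER_STDOUT_SELF)); // B remains valid as long as A does
*/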
132 
133 static PetscErrorCode MatSetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], const PetscScalar v[], InsertMode addv)
134 {
135   Mat_MPIDense *A = (Mat_MPIDense *)mat->data;
136   PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;
137   PetscBool     roworiented = A->roworiented;
138 
139   PetscFunctionBegin;
140   for (i = 0; i < m; i++) {
141     if (idxm[i] < 0) continue;
142     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
143     if (idxm[i] >= rstart && idxm[i] < rend) {
144       row = idxm[i] - rstart;
145       if (roworiented) {
146         PetscCall(MatSetValues(A->A, 1, &row, n, idxn, PetscSafePointerPlusOffset(v, i * n), addv));
147       } else {
148         for (j = 0; j < n; j++) {
149           if (idxn[j] < 0) continue;
150           PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
151           PetscCall(MatSetValues(A->A, 1, &row, 1, &idxn[j], PetscSafePointerPlusOffset(v, i + j * m), addv));
152         }
153       }
154     } else if (!A->donotstash) {
155       mat->assembled = PETSC_FALSE;
156       if (roworiented) {
157         PetscCall(MatStashValuesRow_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i * n), PETSC_FALSE));
158       } else {
159         PetscCall(MatStashValuesCol_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i), m, PETSC_FALSE));
160       }
161     }
162   }
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
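/*
  Editor's sketch: values destined for rows owned by other ranks are stashed
  by the routine above and communicated during assembly, so a typical fill
  loop (illustrative, not from the source) is simply:

    PetscInt    rstart, rend;
    PetscScalar one = 1.0;

    PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
    for (PetscInt i = rstart; i < rend; i++) PetscCall(MatSetValues(A, 1, &i, 1, &i, &one, INSERT_VALUES));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); // ships any stashed off-process entries
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/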
165 
166 static PetscErrorCode MatGetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
167 {
168   Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
169   PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;
170 
171   PetscFunctionBegin;
172   for (i = 0; i < m; i++) {
173     if (idxm[i] < 0) continue; /* negative row */
174     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
175     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported");
176     row = idxm[i] - rstart;
177     for (j = 0; j < n; j++) {
178       if (idxn[j] < 0) continue; /* negative column */
179       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
180       PetscCall(MatGetValues(mdn->A, 1, &row, 1, &idxn[j], v + i * n + j));
181     }
182   }
183   PetscFunctionReturn(PETSC_SUCCESS);
184 }
185 
186 static PetscErrorCode MatDenseGetLDA_MPIDense(Mat A, PetscInt *lda)
187 {
188   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
189 
190   PetscFunctionBegin;
191   PetscCall(MatDenseGetLDA(a->A, lda));
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatDenseSetLDA_MPIDense(Mat A, PetscInt lda)
196 {
197   Mat_MPIDense *a     = (Mat_MPIDense *)A->data;
198   MatType       mtype = MATSEQDENSE;
199 
200   PetscFunctionBegin;
201   if (!a->A) {
202     PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
203     PetscCall(PetscLayoutSetUp(A->rmap));
204     PetscCall(PetscLayoutSetUp(A->cmap));
205     PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
206     PetscCall(MatSetSizes(a->A, A->rmap->n, A->cmap->N, A->rmap->n, A->cmap->N));
207 #if PetscDefined(HAVE_CUDA)
208     PetscBool iscuda;
209     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSECUDA, &iscuda));
210     if (iscuda) mtype = MATSEQDENSECUDA;
211 #elif PetscDefined(HAVE_HIP)
212     PetscBool iship;
213     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSEHIP, &iship));
214     if (iship) mtype = MATSEQDENSEHIP;
215 #endif
216     PetscCall(MatSetType(a->A, mtype));
217   }
218   PetscCall(MatDenseSetLDA(a->A, lda));
219   PetscFunctionReturn(PETSC_SUCCESS);
220 }
221 
222 static PetscErrorCode MatDenseGetArray_MPIDense(Mat A, PetscScalar **array)
223 {
224   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
225 
226   PetscFunctionBegin;
227   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
228   PetscCall(MatDenseGetArray(a->A, array));
229   PetscFunctionReturn(PETSC_SUCCESS);
230 }
231 
232 static PetscErrorCode MatDenseGetArrayRead_MPIDense(Mat A, PetscScalar **array)
233 {
234   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
235 
236   PetscFunctionBegin;
237   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
238   PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)array));
239   PetscFunctionReturn(PETSC_SUCCESS);
240 }
241 
242 static PetscErrorCode MatDenseGetArrayWrite_MPIDense(Mat A, PetscScalar **array)
243 {
244   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
245 
246   PetscFunctionBegin;
247   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
248   PetscCall(MatDenseGetArrayWrite(a->A, array));
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 static PetscErrorCode MatDensePlaceArray_MPIDense(Mat A, const PetscScalar *array)
253 {
254   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
255 
256   PetscFunctionBegin;
257   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
258   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
259   PetscCall(MatDensePlaceArray(a->A, array));
260   PetscFunctionReturn(PETSC_SUCCESS);
261 }
262 
263 static PetscErrorCode MatDenseResetArray_MPIDense(Mat A)
264 {
265   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
266 
267   PetscFunctionBegin;
268   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
269   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
270   PetscCall(MatDenseResetArray(a->A));
271   PetscFunctionReturn(PETSC_SUCCESS);
272 }
273 
274 static PetscErrorCode MatDenseReplaceArray_MPIDense(Mat A, const PetscScalar *array)
275 {
276   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
277 
278   PetscFunctionBegin;
279   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
280   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
281   PetscCall(MatDenseReplaceArray(a->A, array));
282   PetscFunctionReturn(PETSC_SUCCESS);
283 }
284 
285 static PetscErrorCode MatCreateSubMatrix_MPIDense(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
286 {
287   Mat_MPIDense      *mat = (Mat_MPIDense *)A->data, *newmatd;
288   PetscInt           lda, i, j, rstart, rend, nrows, ncols, Ncols, nlrows, nlcols;
289   const PetscInt    *irow, *icol;
290   const PetscScalar *v;
291   PetscScalar       *bv;
292   Mat                newmat;
293   IS                 iscol_local;
294   MPI_Comm           comm_is, comm_mat;
295 
296   PetscFunctionBegin;
297   PetscCall(PetscObjectGetComm((PetscObject)A, &comm_mat));
298   PetscCall(PetscObjectGetComm((PetscObject)iscol, &comm_is));
299   PetscCheck(comm_mat == comm_is, PETSC_COMM_SELF, PETSC_ERR_ARG_NOTSAMECOMM, "IS communicator must match matrix communicator");
300 
301   PetscCall(ISAllGather(iscol, &iscol_local));
302   PetscCall(ISGetIndices(isrow, &irow));
303   PetscCall(ISGetIndices(iscol_local, &icol));
304   PetscCall(ISGetLocalSize(isrow, &nrows));
305   PetscCall(ISGetLocalSize(iscol, &ncols));
306   PetscCall(ISGetSize(iscol, &Ncols)); /* global number of columns, size of iscol_local */
307 
308   /* No parallel redistribution is currently supported: the submatrix must keep the same row
309      partitioning as the original matrix. Each index set should really be checked to confirm
310      that this holds. */
311 
312   PetscCall(MatGetLocalSize(A, &nlrows, &nlcols));
313   PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
314 
315   /* Check submatrix call */
316   if (scall == MAT_REUSE_MATRIX) {
317     /* The row and column sizes of the reused matrix really need to be checked here, */
318     /* e.g., SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Reused submatrix wrong size"); */
319     newmat = *B;
320   } else {
321     /* Create and fill new matrix */
322     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &newmat));
323     PetscCall(MatSetSizes(newmat, nrows, ncols, PETSC_DECIDE, Ncols));
324     PetscCall(MatSetType(newmat, ((PetscObject)A)->type_name));
325     PetscCall(MatMPIDenseSetPreallocation(newmat, NULL));
326   }
327 
328   /* Now extract the data pointers and do the copy, column at a time */
329   newmatd = (Mat_MPIDense *)newmat->data;
330   PetscCall(MatDenseGetArray(newmatd->A, &bv));
331   PetscCall(MatDenseGetArrayRead(mat->A, &v));
332   PetscCall(MatDenseGetLDA(mat->A, &lda));
333   for (i = 0; i < Ncols; i++) {
334     const PetscScalar *av = v + lda * icol[i];
335     for (j = 0; j < nrows; j++) *bv++ = av[irow[j] - rstart];
336   }
337   PetscCall(MatDenseRestoreArrayRead(mat->A, &v));
338   PetscCall(MatDenseRestoreArray(newmatd->A, &bv));
339 
340   /* Assemble the matrices so that the correct flags are set */
341   PetscCall(MatAssemblyBegin(newmat, MAT_FINAL_ASSEMBLY));
342   PetscCall(MatAssemblyEnd(newmat, MAT_FINAL_ASSEMBLY));
343 
344   /* Free work space */
345   PetscCall(ISRestoreIndices(isrow, &irow));
346   PetscCall(ISRestoreIndices(iscol_local, &icol));
347   PetscCall(ISDestroy(&iscol_local));
348   *B = newmat;
349   PetscFunctionReturn(PETSC_SUCCESS);
350 }
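/*
  Editor's sketch of a call into the routine above (the variables rstart,
  rend, first_col, and ncols_local are hypothetical). The row index set must
  keep the row partitioning of the original matrix, since no redistribution
  is done:

    IS  isrow, iscol;
    Mat S;

    PetscCall(ISCreateStride(PETSC_COMM_WORLD, rend - rstart, rstart, 1, &isrow)); // all locally owned rows
    PetscCall(ISCreateStride(PETSC_COMM_WORLD, ncols_local, first_col, 1, &iscol)); // a column range
    PetscCall(MatCreateSubMatrix(A, isrow, iscol, MAT_INITIAL_MATRIX, &S));
    PetscCall(ISDestroy(&isrow));
    PetscCall(ISDestroy(&iscol));
    PetscCall(MatDestroy(&S));
*/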
351 
352 static PetscErrorCode MatDenseRestoreArray_MPIDense(Mat A, PetscScalar **array)
353 {
354   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
355 
356   PetscFunctionBegin;
357   PetscCall(MatDenseRestoreArray(a->A, array));
358   PetscFunctionReturn(PETSC_SUCCESS);
359 }
360 
361 static PetscErrorCode MatDenseRestoreArrayRead_MPIDense(Mat A, PetscScalar **array)
362 {
363   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
364 
365   PetscFunctionBegin;
366   PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)array));
367   PetscFunctionReturn(PETSC_SUCCESS);
368 }
369 
370 static PetscErrorCode MatDenseRestoreArrayWrite_MPIDense(Mat A, PetscScalar **array)
371 {
372   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
373 
374   PetscFunctionBegin;
375   PetscCall(MatDenseRestoreArrayWrite(a->A, array));
376   PetscFunctionReturn(PETSC_SUCCESS);
377 }
378 
379 static PetscErrorCode MatAssemblyBegin_MPIDense(Mat mat, MatAssemblyType mode)
380 {
381   Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
382   PetscInt      nstash, reallocs;
383 
384   PetscFunctionBegin;
385   if (mdn->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
386 
387   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
388   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
389   PetscCall(PetscInfo(mdn->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 static PetscErrorCode MatAssemblyEnd_MPIDense(Mat mat, MatAssemblyType mode)
394 {
395   Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
396   PetscInt      i, *row, *col, flg, j, rstart, ncols;
397   PetscMPIInt   n;
398   PetscScalar  *val;
399 
400   PetscFunctionBegin;
401   if (!mdn->donotstash && !mat->nooffprocentries) {
402     /*  wait on receives */
403     while (1) {
404       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
405       if (!flg) break;
406 
407       for (i = 0; i < n;) {
408         /* Now identify the consecutive vals belonging to the same row */
409         for (j = i, rstart = row[j]; j < n; j++) {
410           if (row[j] != rstart) break;
411         }
412         if (j < n) ncols = j - i;
413         else ncols = n - i;
414         /* Now assemble all these values with a single function call */
415         PetscCall(MatSetValues_MPIDense(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
416         i = j;
417       }
418     }
419     PetscCall(MatStashScatterEnd_Private(&mat->stash));
420   }
421 
422   PetscCall(MatAssemblyBegin(mdn->A, mode));
423   PetscCall(MatAssemblyEnd(mdn->A, mode));
424   PetscFunctionReturn(PETSC_SUCCESS);
425 }
426 
427 static PetscErrorCode MatZeroEntries_MPIDense(Mat A)
428 {
429   Mat_MPIDense *l = (Mat_MPIDense *)A->data;
430 
431   PetscFunctionBegin;
432   PetscCall(MatZeroEntries(l->A));
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 static PetscErrorCode MatZeroRows_MPIDense(Mat A, PetscInt n, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
437 {
438   Mat_MPIDense *l = (Mat_MPIDense *)A->data;
439   PetscInt      i, len, *lrows;
440 
441   PetscFunctionBegin;
442   /* get locally owned rows */
443   PetscCall(PetscLayoutMapLocal(A->rmap, n, rows, &len, &lrows, NULL));
444   /* fix right-hand side if needed */
445   if (x && b) {
446     const PetscScalar *xx;
447     PetscScalar       *bb;
448 
449     PetscCall(VecGetArrayRead(x, &xx));
450     PetscCall(VecGetArrayWrite(b, &bb));
451     for (i = 0; i < len; ++i) bb[lrows[i]] = diag * xx[lrows[i]];
452     PetscCall(VecRestoreArrayRead(x, &xx));
453     PetscCall(VecRestoreArrayWrite(b, &bb));
454   }
455   PetscCall(MatZeroRows(l->A, len, lrows, 0.0, NULL, NULL));
456   if (diag != 0.0) {
457     Vec d;
458 
459     PetscCall(MatCreateVecs(A, NULL, &d));
460     PetscCall(VecSet(d, diag));
461     PetscCall(MatDiagonalSet(A, d, INSERT_VALUES));
462     PetscCall(VecDestroy(&d));
463   }
464   PetscCall(PetscFree(lrows));
465   PetscFunctionReturn(PETSC_SUCCESS);
466 }
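/*
  Editor's sketch (x and b are hypothetical solution/right-hand-side vectors):
  on the ranks owning the listed rows, b is overwritten with diag * x there
  before the rows are zeroed. Note that, as implemented above, a nonzero diag
  is inserted on the entire diagonal, not only in the zeroed rows:

    PetscInt rows[] = {0, 1}; // global row indices

    PetscCall(MatZeroRows(A, 2, rows, 1.0, x, b));
*/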
467 
468 PETSC_INTERN PetscErrorCode MatMult_SeqDense(Mat, Vec, Vec);
469 PETSC_INTERN PetscErrorCode MatMultAdd_SeqDense(Mat, Vec, Vec, Vec);
470 PETSC_INTERN PetscErrorCode MatMultTranspose_SeqDense(Mat, Vec, Vec);
471 PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqDense(Mat, Vec, Vec, Vec);
472 
473 static PetscErrorCode MatMultColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
474 {
475   Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
476   const PetscScalar *ax;
477   PetscScalar       *ay;
478   PetscMemType       axmtype, aymtype;
479 
480   PetscFunctionBegin;
481   if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
482   PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
483   PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
484   PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
485   PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
486   PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
487   PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
488   PetscUseMethod(mdn->A, "MatMultColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, c_start, c_end));
489   PetscFunctionReturn(PETSC_SUCCESS);
490 }
491 
492 static PetscErrorCode MatMult_MPIDense(Mat mat, Vec xx, Vec yy)
493 {
494   Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
495   const PetscScalar *ax;
496   PetscScalar       *ay;
497   PetscMemType       axmtype, aymtype;
498 
499   PetscFunctionBegin;
500   if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
501   PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
502   PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
503   PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
504   PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
505   PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
506   PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
507   PetscCall((*mdn->A->ops->mult)(mdn->A, mdn->lvec, yy));
508   PetscFunctionReturn(PETSC_SUCCESS);
509 }
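/*
  Editor's note (sketch, not authoritative): the multiply kernels above and
  below share one pattern: broadcast the needed entries of the input vector
  into the work vector lvec through the Mvctx star forest, then apply the
  local (m x N) dense block. A caller only sees the public interface:

    Vec x, y;

    PetscCall(MatCreateVecs(A, &x, &y));
    PetscCall(VecSet(x, 1.0));
    PetscCall(MatMult(A, x, y)); // y = A x
    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
*/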
510 
511 static PetscErrorCode MatMultAddColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
512 {
513   Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
514   const PetscScalar *ax;
515   PetscScalar       *ay;
516   PetscMemType       axmtype, aymtype;
517 
518   PetscFunctionBegin;
519   if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
520   PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
521   PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
522   PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
523   PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
524   PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
525   PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
526   PetscUseMethod(mdn->A, "MatMultAddColumnRange_C", (Mat, Vec, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, zz, c_start, c_end));
527   PetscFunctionReturn(PETSC_SUCCESS);
528 }
529 
530 static PetscErrorCode MatMultAdd_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz)
531 {
532   Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
533   const PetscScalar *ax;
534   PetscScalar       *ay;
535   PetscMemType       axmtype, aymtype;
536 
537   PetscFunctionBegin;
538   if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
539   PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
540   PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
541   PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
542   PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
543   PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
544   PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
545   PetscCall((*mdn->A->ops->multadd)(mdn->A, mdn->lvec, yy, zz));
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 static PetscErrorCode MatMultHermitianTransposeColumnRange_MPIDense(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
550 {
551   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
552   const PetscScalar *ax;
553   PetscScalar       *ay;
554   PetscMemType       axmtype, aymtype;
555   PetscInt           r_start, r_end;
556   PetscInt           c_start_local, c_end_local;
557 
558   PetscFunctionBegin;
559   if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
560   PetscCall(VecZeroEntries(a->lvec));
561   PetscCall(VecGetOwnershipRange(yy, &r_start, &r_end));
562   c_start_local = PetscMax(c_start, r_start);
563   c_end_local   = PetscMin(c_end, r_end);
564   PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
565   if (c_end_local > c_start_local) {
566     if (PetscMemTypeHost(aymtype)) {
567       PetscCall(PetscArrayzero(&ay[c_start_local], (size_t)(c_end_local - c_start_local)));
568     } else {
569       PetscCall(PetscDeviceRegisterMemory(ay, aymtype, sizeof(*ay) * ((size_t)(r_end - r_start))));
570       PetscCall(PetscDeviceArrayZero(NULL, &ay[c_start_local], (size_t)(c_end_local - c_start_local)));
571     }
572   }
573   PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
574   PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
575   PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
576   PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
577   PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
578   PetscCall(VecRestoreArrayAndMemType(yy, &ay));
579   PetscFunctionReturn(PETSC_SUCCESS);
580 }
581 
582 static PetscErrorCode MatMultTransposeKernel_MPIDense(Mat A, Vec xx, Vec yy, PetscBool herm)
583 {
584   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
585   const PetscScalar *ax;
586   PetscScalar       *ay;
587   PetscMemType       axmtype, aymtype;
588 
589   PetscFunctionBegin;
590   if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
591   PetscCall(VecSet(yy, 0.0));
592   if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
593   else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
594   PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
595   PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
596   PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
597   PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
598   PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
599   PetscCall(VecRestoreArrayAndMemType(yy, &ay));
600   PetscFunctionReturn(PETSC_SUCCESS);
601 }
602 
603 static PetscErrorCode MatMultHermitianTransposeAddColumnRange_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
604 {
605   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
606   const PetscScalar *ax;
607   PetscScalar       *ay;
608   PetscMemType       axmtype, aymtype;
609 
610   PetscFunctionBegin;
611   if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
612   PetscCall(VecCopy(yy, zz));
613   PetscCall(VecZeroEntries(a->lvec));
614   PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
615   PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
616   PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
617   PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
618   PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
619   PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
620   PetscCall(VecRestoreArrayAndMemType(zz, &ay));
621   PetscFunctionReturn(PETSC_SUCCESS);
622 }
623 
624 static PetscErrorCode MatMultTransposeAddKernel_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscBool herm)
625 {
626   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
627   const PetscScalar *ax;
628   PetscScalar       *ay;
629   PetscMemType       axmtype, aymtype;
630 
631   PetscFunctionBegin;
632   if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
633   PetscCall(VecCopy(yy, zz));
634   if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
635   else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
636   PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
637   PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
638   PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
639   PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
640   PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
641   PetscCall(VecRestoreArrayAndMemType(zz, &ay));
642   PetscFunctionReturn(PETSC_SUCCESS);
643 }
644 
645 static PetscErrorCode MatMultTranspose_MPIDense(Mat A, Vec xx, Vec yy)
646 {
647   PetscFunctionBegin;
648   PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_FALSE));
649   PetscFunctionReturn(PETSC_SUCCESS);
650 }
651 
652 static PetscErrorCode MatMultTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
653 {
654   PetscFunctionBegin;
655   PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_FALSE));
656   PetscFunctionReturn(PETSC_SUCCESS);
657 }
658 
659 static PetscErrorCode MatMultHermitianTranspose_MPIDense(Mat A, Vec xx, Vec yy)
660 {
661   PetscFunctionBegin;
662   PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_TRUE));
663   PetscFunctionReturn(PETSC_SUCCESS);
664 }
665 
666 static PetscErrorCode MatMultHermitianTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
667 {
668   PetscFunctionBegin;
669   PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_TRUE));
670   PetscFunctionReturn(PETSC_SUCCESS);
671 }
672 
673 PetscErrorCode MatGetDiagonal_MPIDense(Mat A, Vec v)
674 {
675   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
676   PetscInt           lda, len, i, nl, ng, m = A->rmap->n, radd;
677   PetscScalar       *x;
678   const PetscScalar *av;
679 
680   PetscFunctionBegin;
681   PetscCall(VecGetArray(v, &x));
682   PetscCall(VecGetSize(v, &ng));
683   PetscCheck(ng == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming mat and vec");
684   PetscCall(VecGetLocalSize(v, &nl));
685   len  = PetscMin(a->A->rmap->n, a->A->cmap->n);
686   radd = A->rmap->rstart * m; /* column offset of the diagonal block; assumes the local lda equals m */
687   PetscCall(MatDenseGetArrayRead(a->A, &av));
688   PetscCall(MatDenseGetLDA(a->A, &lda));
689   for (i = 0; i < len; i++) x[i] = av[radd + i * lda + i];
690   PetscCall(MatDenseRestoreArrayRead(a->A, &av));
691   if (nl - i > 0) PetscCall(PetscArrayzero(x + i, nl - i));
692   PetscCall(VecRestoreArray(v, &x));
693   PetscFunctionReturn(PETSC_SUCCESS);
694 }
695 
696 static PetscErrorCode MatDestroy_MPIDense(Mat mat)
697 {
698   Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
699 
700   PetscFunctionBegin;
701   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
702   PetscCall(MatStashDestroy_Private(&mat->stash));
703   PetscCheck(!mdn->vecinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
704   PetscCheck(!mdn->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
705   PetscCall(MatDestroy(&mdn->A));
706   PetscCall(VecDestroy(&mdn->lvec));
707   PetscCall(PetscSFDestroy(&mdn->Mvctx));
708   PetscCall(VecDestroy(&mdn->cvec));
709   PetscCall(MatDestroy(&mdn->cmat));
710 
711   PetscCall(PetscFree(mat->data));
712   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
713 
714   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", NULL));
715   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", NULL));
716   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", NULL));
717   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", NULL));
718   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", NULL));
719   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", NULL));
720   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", NULL));
721   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", NULL));
722   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", NULL));
723   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", NULL));
724   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", NULL));
725   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
726   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", NULL));
727 #if defined(PETSC_HAVE_ELEMENTAL)
728   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", NULL));
729 #endif
730 #if defined(PETSC_HAVE_SCALAPACK)
731   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", NULL));
732 #endif
733   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", NULL));
734   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", NULL));
735   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", NULL));
736 #if defined(PETSC_HAVE_CUDA)
737   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", NULL));
738   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", NULL));
739   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", NULL));
740   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensecuda_mpidense_C", NULL));
741   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensecuda_C", NULL));
742   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidensecuda_C", NULL));
743   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaij_C", NULL));
744   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaijcusparse_C", NULL));
745   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArray_C", NULL));
746   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayRead_C", NULL));
747   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayWrite_C", NULL));
748   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArray_C", NULL));
749   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayRead_C", NULL));
750   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayWrite_C", NULL));
751   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAPlaceArray_C", NULL));
752   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAResetArray_C", NULL));
753   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAReplaceArray_C", NULL));
754   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDASetPreallocation_C", NULL));
755 #endif
756 #if defined(PETSC_HAVE_HIP)
757   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", NULL));
758   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", NULL));
759   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", NULL));
760   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensehip_mpidense_C", NULL));
761   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensehip_C", NULL));
762   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidensehip_C", NULL));
763   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaij_C", NULL));
764   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaijhipsparse_C", NULL));
765   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArray_C", NULL));
766   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayRead_C", NULL));
767   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayWrite_C", NULL));
768   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArray_C", NULL));
769   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayRead_C", NULL));
770   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayWrite_C", NULL));
771   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPPlaceArray_C", NULL));
772   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPResetArray_C", NULL));
773   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPReplaceArray_C", NULL));
774   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPSetPreallocation_C", NULL));
775 #endif
776   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", NULL));
777   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", NULL));
778   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", NULL));
779   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", NULL));
780   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", NULL));
781   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", NULL));
782   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", NULL));
783   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", NULL));
784   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", NULL));
785   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", NULL));
786   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", NULL));
787   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", NULL));
788   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", NULL));
789   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", NULL));
790 
791   PetscCall(PetscObjectCompose((PetscObject)mat, "DiagonalBlock", NULL));
792   PetscFunctionReturn(PETSC_SUCCESS);
793 }
794 
795 #include <petscdraw.h>
796 static PetscErrorCode MatView_MPIDense_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
797 {
798   Mat_MPIDense     *mdn = (Mat_MPIDense *)mat->data;
799   PetscMPIInt       rank;
800   PetscViewerType   vtype;
801   PetscBool         isascii, isdraw;
802   PetscViewer       sviewer;
803   PetscViewerFormat format;
804 
805   PetscFunctionBegin;
806   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
807   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
808   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
809   if (isascii) {
810     PetscCall(PetscViewerGetType(viewer, &vtype));
811     PetscCall(PetscViewerGetFormat(viewer, &format));
812     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
813       MatInfo info;
814       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
815       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
816       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "  [%d] local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %" PetscInt_FMT " \n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
817                                                    (PetscInt)info.memory));
818       PetscCall(PetscViewerFlush(viewer));
819       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
820       if (mdn->Mvctx) PetscCall(PetscSFView(mdn->Mvctx, viewer));
821       PetscFunctionReturn(PETSC_SUCCESS);
822     } else if (format == PETSC_VIEWER_ASCII_INFO) {
823       PetscFunctionReturn(PETSC_SUCCESS);
824     }
825   } else if (isdraw) {
826     PetscDraw draw;
827     PetscBool isnull;
828 
829     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
830     PetscCall(PetscDrawIsNull(draw, &isnull));
831     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
832   }
833 
834   {
835     /* Assemble the entire matrix onto the first process. */
836     Mat          A;
837     PetscInt     M = mat->rmap->N, N = mat->cmap->N, m, row, i, nz;
838     PetscInt    *cols;
839     PetscScalar *vals;
840 
841     PetscCall(MatCreate(PetscObjectComm((PetscObject)mat), &A));
842     if (rank == 0) {
843       PetscCall(MatSetSizes(A, M, N, M, N));
844     } else {
845       PetscCall(MatSetSizes(A, 0, 0, M, N));
846     }
847     /* Since this is a temporary matrix, MATMPIDENSE instead of ((PetscObject)A)->type_name here is probably acceptable. */
848     PetscCall(MatSetType(A, MATMPIDENSE));
849     PetscCall(MatMPIDenseSetPreallocation(A, NULL));
850 
851     /* Copy the matrix row by row. This isn't the most efficient approach,
852        but it is simple and quick for now */
853     A->insertmode = INSERT_VALUES;
854 
855     row = mat->rmap->rstart;
856     m   = mdn->A->rmap->n;
857     for (i = 0; i < m; i++) {
858       PetscCall(MatGetRow_MPIDense(mat, row, &nz, &cols, &vals));
859       PetscCall(MatSetValues_MPIDense(A, 1, &row, nz, cols, vals, INSERT_VALUES));
860       PetscCall(MatRestoreRow_MPIDense(mat, row, &nz, &cols, &vals));
861       row++;
862     }
863 
864     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
865     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
866     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
867     if (rank == 0) {
868       PetscCall(PetscObjectSetName((PetscObject)((Mat_MPIDense *)A->data)->A, ((PetscObject)mat)->name));
869       PetscCall(MatView_SeqDense(((Mat_MPIDense *)A->data)->A, sviewer));
870     }
871     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
872     PetscCall(MatDestroy(&A));
873   }
874   PetscFunctionReturn(PETSC_SUCCESS);
875 }
876 
877 static PetscErrorCode MatView_MPIDense(Mat mat, PetscViewer viewer)
878 {
879   PetscBool isascii, isbinary, isdraw, issocket;
880 
881   PetscFunctionBegin;
882   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
883   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
884   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
885   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
886 
887   if (isascii || issocket || isdraw) PetscCall(MatView_MPIDense_ASCIIorDraworSocket(mat, viewer));
888   else if (isbinary) PetscCall(MatView_Dense_Binary(mat, viewer));
889   PetscFunctionReturn(PETSC_SUCCESS);
890 }
891 
892 static PetscErrorCode MatGetInfo_MPIDense(Mat A, MatInfoType flag, MatInfo *info)
893 {
894   Mat_MPIDense  *mat = (Mat_MPIDense *)A->data;
895   Mat            mdn = mat->A;
896   PetscLogDouble isend[5], irecv[5];
897 
898   PetscFunctionBegin;
899   info->block_size = 1.0;
900 
901   PetscCall(MatGetInfo(mdn, MAT_LOCAL, info));
902 
903   isend[0] = info->nz_used;
904   isend[1] = info->nz_allocated;
905   isend[2] = info->nz_unneeded;
906   isend[3] = info->memory;
907   isend[4] = info->mallocs;
908   if (flag == MAT_LOCAL) {
909     info->nz_used      = isend[0];
910     info->nz_allocated = isend[1];
911     info->nz_unneeded  = isend[2];
912     info->memory       = isend[3];
913     info->mallocs      = isend[4];
914   } else if (flag == MAT_GLOBAL_MAX) {
915     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)A)));
916 
917     info->nz_used      = irecv[0];
918     info->nz_allocated = irecv[1];
919     info->nz_unneeded  = irecv[2];
920     info->memory       = irecv[3];
921     info->mallocs      = irecv[4];
922   } else if (flag == MAT_GLOBAL_SUM) {
923     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)A)));
924 
925     info->nz_used      = irecv[0];
926     info->nz_allocated = irecv[1];
927     info->nz_unneeded  = irecv[2];
928     info->memory       = irecv[3];
929     info->mallocs      = irecv[4];
930   }
931   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
932   info->fill_ratio_needed = 0;
933   info->factor_mallocs    = 0;
934   PetscFunctionReturn(PETSC_SUCCESS);
935 }
936 
937 static PetscErrorCode MatSetOption_MPIDense(Mat A, MatOption op, PetscBool flg)
938 {
939   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
940 
941   PetscFunctionBegin;
942   switch (op) {
943   case MAT_NEW_NONZERO_LOCATIONS:
944   case MAT_NEW_NONZERO_LOCATION_ERR:
945   case MAT_NEW_NONZERO_ALLOCATION_ERR:
946     MatCheckPreallocated(A, 1);
947     PetscCall(MatSetOption(a->A, op, flg));
948     break;
949   case MAT_ROW_ORIENTED:
950     MatCheckPreallocated(A, 1);
951     a->roworiented = flg;
952     PetscCall(MatSetOption(a->A, op, flg));
953     break;
954   case MAT_IGNORE_OFF_PROC_ENTRIES:
955     a->donotstash = flg;
956     break;
957   case MAT_SYMMETRIC:
958   case MAT_STRUCTURALLY_SYMMETRIC:
959   case MAT_HERMITIAN:
960   case MAT_SYMMETRY_ETERNAL:
961   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
962   case MAT_SPD:
963   case MAT_SPD_ETERNAL:
964     /* if the local matrix layout is square, it inherits some of the properties above */
965     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
966     break;
967   default:
968     break;
969   }
970   PetscFunctionReturn(PETSC_SUCCESS);
971 }
972 
973 static PetscErrorCode MatDiagonalScale_MPIDense(Mat A, Vec ll, Vec rr)
974 {
975   Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
976   const PetscScalar *l;
977   PetscScalar        x, *v, *vv, *r;
978   PetscInt           i, j, s2a, s3a, s2, s3, m = mdn->A->rmap->n, n = mdn->A->cmap->n, lda;
979 
980   PetscFunctionBegin;
981   PetscCall(MatDenseGetArray(mdn->A, &vv));
982   PetscCall(MatDenseGetLDA(mdn->A, &lda));
983   PetscCall(MatGetLocalSize(A, &s2, &s3));
984   if (ll) {
985     PetscCall(VecGetLocalSize(ll, &s2a));
986     PetscCheck(s2a == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT, s2a, s2);
987     PetscCall(VecGetArrayRead(ll, &l));
988     for (i = 0; i < m; i++) {
989       x = l[i];
990       v = vv + i;
991       for (j = 0; j < n; j++) {
992         (*v) *= x;
993         v += lda;
994       }
995     }
996     PetscCall(VecRestoreArrayRead(ll, &l));
997     PetscCall(PetscLogFlops(1.0 * n * m));
998   }
999   if (rr) {
1000     const PetscScalar *ar;
1001 
1002     PetscCall(VecGetLocalSize(rr, &s3a));
1003     PetscCheck(s3a == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vec non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT ".", s3a, s3);
1004     PetscCall(VecGetArrayRead(rr, &ar));
1005     if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
1006     PetscCall(VecGetArray(mdn->lvec, &r));
1007     PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
1008     PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
1009     PetscCall(VecRestoreArrayRead(rr, &ar));
1010     for (i = 0; i < n; i++) {
1011       x = r[i];
1012       v = vv + i * lda;
1013       for (j = 0; j < m; j++) (*v++) *= x;
1014     }
1015     PetscCall(VecRestoreArray(mdn->lvec, &r));
1016     PetscCall(PetscLogFlops(1.0 * n * m));
1017   }
1018   PetscCall(MatDenseRestoreArray(mdn->A, &vv));
1019   PetscFunctionReturn(PETSC_SUCCESS);
1020 }
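/*
  Editor's note (sketch): the routine above scales A in place, A := diag(ll) A diag(rr);
  the right scaling vector is broadcast to every rank because each rank stores
  all columns of its local block. Either argument may be NULL to skip that side:

    PetscCall(MatDiagonalScale(A, ll, NULL)); // left scaling only; ll is a hypothetical Vec
*/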
1021 
1022 static PetscErrorCode MatNorm_MPIDense(Mat A, NormType type, PetscReal *nrm)
1023 {
1024   Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
1025   PetscInt           i, j;
1026   PetscMPIInt        size;
1027   PetscReal          sum = 0.0;
1028   const PetscScalar *av, *v;
1029 
1030   PetscFunctionBegin;
1031   PetscCall(MatDenseGetArrayRead(mdn->A, &av));
1032   v = av;
1033   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1034   if (size == 1) {
1035     PetscCall(MatNorm(mdn->A, type, nrm));
1036   } else {
1037     if (type == NORM_FROBENIUS) {
1038       for (i = 0; i < mdn->A->cmap->n * mdn->A->rmap->n; i++) {
1039         sum += PetscRealPart(PetscConj(*v) * (*v));
1040         v++;
1041       }
1042       PetscCallMPI(MPIU_Allreduce(&sum, nrm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1043       *nrm = PetscSqrtReal(*nrm);
1044       PetscCall(PetscLogFlops(2.0 * mdn->A->cmap->n * mdn->A->rmap->n));
1045     } else if (type == NORM_1) {
1046       PetscReal *tmp;
1047 
1048       PetscCall(PetscCalloc1(A->cmap->N, &tmp));
1049       *nrm = 0.0;
1050       v    = av;
1051       for (j = 0; j < mdn->A->cmap->n; j++) {
1052         for (i = 0; i < mdn->A->rmap->n; i++) {
1053           tmp[j] += PetscAbsScalar(*v);
1054           v++;
1055         }
1056       }
1057       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, A->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1058       for (j = 0; j < A->cmap->N; j++) {
1059         if (tmp[j] > *nrm) *nrm = tmp[j];
1060       }
1061       PetscCall(PetscFree(tmp));
1062       PetscCall(PetscLogFlops(A->cmap->n * A->rmap->n));
1063     } else if (type == NORM_INFINITY) { /* max row norm */
1064       PetscCall(MatNorm(mdn->A, type, nrm));
1065       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, nrm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
1066     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "No support for two norm");
1067   }
1068   PetscCall(MatDenseRestoreArrayRead(mdn->A, &av));
1069   PetscFunctionReturn(PETSC_SUCCESS);
1070 }
1071 
1072 static PetscErrorCode MatTranspose_MPIDense(Mat A, MatReuse reuse, Mat *matout)
1073 {
1074   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1075   Mat           B;
1076   PetscInt      M = A->rmap->N, N = A->cmap->N, m, n, *rwork, rstart = A->rmap->rstart;
1077   PetscInt      j, i, lda;
1078   PetscScalar  *v;
1079 
1080   PetscFunctionBegin;
1081   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1082   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1083     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1084     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1085     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1086     PetscCall(MatMPIDenseSetPreallocation(B, NULL));
1087   } else B = *matout;
1088 
1089   m = a->A->rmap->n;
1090   n = a->A->cmap->n;
1091   PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)&v));
1092   PetscCall(MatDenseGetLDA(a->A, &lda));
1093   PetscCall(PetscMalloc1(m, &rwork));
1094   for (i = 0; i < m; i++) rwork[i] = rstart + i;
1095   for (j = 0; j < n; j++) {
1096     PetscCall(MatSetValues(B, 1, &j, m, rwork, v, INSERT_VALUES));
1097     v = PetscSafePointerPlusOffset(v, lda);
1098   }
1099   PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)&v));
1100   PetscCall(PetscFree(rwork));
1101   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1102   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1103   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1104     *matout = B;
1105   } else {
1106     PetscCall(MatHeaderMerge(A, &B));
1107   }
1108   PetscFunctionReturn(PETSC_SUCCESS);
1109 }
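/*
  Editor's sketch of the three reuse modes dispatched above (names illustrative):

    Mat At;

    PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); // allocate At = A^T
    PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &At));   // recompute into the existing At
    PetscCall(MatTranspose(A, MAT_INPLACE_MATRIX, &A));  // replace A by its transpose
    PetscCall(MatDestroy(&At));
*/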
1110 
1111 static PetscErrorCode       MatDuplicate_MPIDense(Mat, MatDuplicateOption, Mat *);
1112 PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat, PetscScalar);
1113 
1114 static PetscErrorCode MatSetUp_MPIDense(Mat A)
1115 {
1116   PetscFunctionBegin;
1117   PetscCall(PetscLayoutSetUp(A->rmap));
1118   PetscCall(PetscLayoutSetUp(A->cmap));
1119   if (!A->preallocated) PetscCall(MatMPIDenseSetPreallocation(A, NULL));
1120   PetscFunctionReturn(PETSC_SUCCESS);
1121 }
1122 
1123 static PetscErrorCode MatAXPY_MPIDense(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1124 {
1125   Mat_MPIDense *A = (Mat_MPIDense *)Y->data, *B = (Mat_MPIDense *)X->data;
1126 
1127   PetscFunctionBegin;
1128   PetscCall(MatAXPY(A->A, alpha, B->A, str));
1129   PetscFunctionReturn(PETSC_SUCCESS);
1130 }
1131 
1132 static PetscErrorCode MatConjugate_MPIDense(Mat mat)
1133 {
1134   Mat_MPIDense *a = (Mat_MPIDense *)mat->data;
1135 
1136   PetscFunctionBegin;
1137   PetscCall(MatConjugate(a->A));
1138   PetscFunctionReturn(PETSC_SUCCESS);
1139 }
1140 
1141 static PetscErrorCode MatRealPart_MPIDense(Mat A)
1142 {
1143   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1144 
1145   PetscFunctionBegin;
1146   PetscCall(MatRealPart(a->A));
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 static PetscErrorCode MatImaginaryPart_MPIDense(Mat A)
1151 {
1152   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1153 
1154   PetscFunctionBegin;
1155   PetscCall(MatImaginaryPart(a->A));
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 static PetscErrorCode MatGetColumnVector_MPIDense(Mat A, Vec v, PetscInt col)
1160 {
1161   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1162 
1163   PetscFunctionBegin;
1164   PetscCheck(a->A, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing local matrix");
1165   PetscCheck(a->A->ops->getcolumnvector, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing get column operation");
1166   PetscCall((*a->A->ops->getcolumnvector)(a->A, v, col));
1167   PetscFunctionReturn(PETSC_SUCCESS);
1168 }
1169 
1170 PETSC_INTERN PetscErrorCode MatGetColumnReductions_SeqDense(Mat, PetscInt, PetscReal *);
1171 
1172 static PetscErrorCode MatGetColumnReductions_MPIDense(Mat A, PetscInt type, PetscReal *reductions)
1173 {
1174   PetscInt      i, m, n;
1175   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1176 
1177   PetscFunctionBegin;
1178   PetscCall(MatGetSize(A, &m, &n));
1179   if (type == REDUCTION_MEAN_REALPART) {
1180     PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_REALPART, reductions));
1181   } else if (type == REDUCTION_MEAN_IMAGINARYPART) {
1182     PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_IMAGINARYPART, reductions));
1183   } else {
1184     PetscCall(MatGetColumnReductions_SeqDense(a->A, type, reductions));
1185   }
1186   if (type == NORM_2) {
1187     for (i = 0; i < n; i++) reductions[i] *= reductions[i];
1188   }
1189   if (type == NORM_INFINITY) {
1190     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_MAX, A->hdr.comm));
1191   } else {
1192     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_SUM, A->hdr.comm));
1193   }
1194   if (type == NORM_2) {
1195     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
1196   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
1197     for (i = 0; i < n; i++) reductions[i] /= m;
1198   }
1199   PetscFunctionReturn(PETSC_SUCCESS);
1200 }
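/*
  Editor's note (sketch): the routine above backs the public column
  reductions; 2-norms are reduced as squared sums and re-rooted after the
  Allreduce, and means divide the globally summed columns by the global row
  count. A typical caller:

    PetscReal *norms;
    PetscInt   n;

    PetscCall(MatGetSize(A, NULL, &n));
    PetscCall(PetscMalloc1(n, &norms));
    PetscCall(MatGetColumnNorms(A, NORM_2, norms));
    PetscCall(PetscFree(norms));
*/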
1201 
1202 static PetscErrorCode MatSetRandom_MPIDense(Mat x, PetscRandom rctx)
1203 {
1204   Mat_MPIDense *d = (Mat_MPIDense *)x->data;
1205 
1206   PetscFunctionBegin;
1207   PetscCall(MatSetRandom(d->A, rctx));
1208 #if defined(PETSC_HAVE_DEVICE)
1209   x->offloadmask = d->A->offloadmask;
1210 #endif
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 static PetscErrorCode MatMissingDiagonal_MPIDense(Mat A, PetscBool *missing, PetscInt *d)
1215 {
1216   PetscFunctionBegin;
1217   *missing = PETSC_FALSE;
1218   PetscFunctionReturn(PETSC_SUCCESS);
1219 }
1220 
1221 static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
1222 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
1223 static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
1224 static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
1225 static PetscErrorCode MatEqual_MPIDense(Mat, Mat, PetscBool *);
1226 static PetscErrorCode MatLoad_MPIDense(Mat, PetscViewer);
1227 static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat);
1228 
1229 static struct _MatOps MatOps_Values = {MatSetValues_MPIDense,
1230                                        MatGetRow_MPIDense,
1231                                        MatRestoreRow_MPIDense,
1232                                        MatMult_MPIDense,
1233                                        /*  4*/ MatMultAdd_MPIDense,
1234                                        MatMultTranspose_MPIDense,
1235                                        MatMultTransposeAdd_MPIDense,
1236                                        NULL,
1237                                        NULL,
1238                                        NULL,
1239                                        /* 10*/ NULL,
1240                                        NULL,
1241                                        NULL,
1242                                        NULL,
1243                                        MatTranspose_MPIDense,
1244                                        /* 15*/ MatGetInfo_MPIDense,
1245                                        MatEqual_MPIDense,
1246                                        MatGetDiagonal_MPIDense,
1247                                        MatDiagonalScale_MPIDense,
1248                                        MatNorm_MPIDense,
1249                                        /* 20*/ MatAssemblyBegin_MPIDense,
1250                                        MatAssemblyEnd_MPIDense,
1251                                        MatSetOption_MPIDense,
1252                                        MatZeroEntries_MPIDense,
1253                                        /* 24*/ MatZeroRows_MPIDense,
1254                                        NULL,
1255                                        NULL,
1256                                        NULL,
1257                                        NULL,
1258                                        /* 29*/ MatSetUp_MPIDense,
1259                                        NULL,
1260                                        NULL,
1261                                        MatGetDiagonalBlock_MPIDense,
1262                                        NULL,
1263                                        /* 34*/ MatDuplicate_MPIDense,
1264                                        NULL,
1265                                        NULL,
1266                                        NULL,
1267                                        NULL,
1268                                        /* 39*/ MatAXPY_MPIDense,
1269                                        MatCreateSubMatrices_MPIDense,
1270                                        NULL,
1271                                        MatGetValues_MPIDense,
1272                                        MatCopy_MPIDense,
1273                                        /* 44*/ NULL,
1274                                        MatScale_MPIDense,
1275                                        MatShift_MPIDense,
1276                                        NULL,
1277                                        NULL,
1278                                        /* 49*/ MatSetRandom_MPIDense,
1279                                        NULL,
1280                                        NULL,
1281                                        NULL,
1282                                        NULL,
1283                                        /* 54*/ NULL,
1284                                        NULL,
1285                                        NULL,
1286                                        NULL,
1287                                        NULL,
1288                                        /* 59*/ MatCreateSubMatrix_MPIDense,
1289                                        MatDestroy_MPIDense,
1290                                        MatView_MPIDense,
1291                                        NULL,
1292                                        NULL,
1293                                        /* 64*/ NULL,
1294                                        NULL,
1295                                        NULL,
1296                                        NULL,
1297                                        NULL,
1298                                        /* 69*/ NULL,
1299                                        NULL,
1300                                        NULL,
1301                                        NULL,
1302                                        NULL,
1303                                        /* 74*/ NULL,
1304                                        NULL,
1305                                        NULL,
1306                                        NULL,
1307                                        MatLoad_MPIDense,
1308                                        /* 79*/ NULL,
1309                                        NULL,
1310                                        NULL,
1311                                        NULL,
1312                                        /* 83*/ NULL,
1313                                        NULL,
1314                                        NULL,
1315                                        NULL,
1316                                        MatMatTransposeMultSymbolic_MPIDense_MPIDense,
1317                                        MatMatTransposeMultNumeric_MPIDense_MPIDense,
1318                                        /* 89*/ NULL,
1319                                        MatProductSetFromOptions_MPIDense,
1320                                        NULL,
1321                                        NULL,
1322                                        MatConjugate_MPIDense,
1323                                        /* 94*/ NULL,
1324                                        NULL,
1325                                        MatRealPart_MPIDense,
1326                                        MatImaginaryPart_MPIDense,
1327                                        NULL,
1328                                        /*99*/ NULL,
1329                                        NULL,
1330                                        NULL,
1331                                        NULL,
1332                                        MatGetColumnVector_MPIDense,
1333                                        /*104*/ MatMissingDiagonal_MPIDense,
1334                                        NULL,
1335                                        NULL,
1336                                        NULL,
1337                                        NULL,
1338                                        /*109*/ NULL,
1339                                        NULL,
1340                                        NULL,
1341                                        MatMultHermitianTranspose_MPIDense,
1342                                        MatMultHermitianTransposeAdd_MPIDense,
1343                                        /*114*/ NULL,
1344                                        NULL,
1345                                        MatGetColumnReductions_MPIDense,
1346                                        NULL,
1347                                        NULL,
1348                                        NULL,
1349                                        /*120*/ NULL,
1350                                        MatTransposeMatMultSymbolic_MPIDense_MPIDense,
1351                                        MatTransposeMatMultNumeric_MPIDense_MPIDense,
1352                                        NULL,
1353                                        /*124*/ NULL,
1354                                        NULL,
1355                                        NULL,
1356                                        NULL,
1357                                        NULL,
1358                                        /*129*/ NULL,
1359                                        NULL,
1360                                        MatCreateMPIMatConcatenateSeqMat_MPIDense,
1361                                        NULL,
1362                                        NULL,
1363                                        /*134*/ NULL,
1364                                        NULL,
1365                                        NULL,
1366                                        NULL,
1367                                        NULL,
1368                                        /*139*/ NULL,
1369                                        NULL,
1370                                        NULL,
1371                                        NULL,
1372                                        NULL};
1373 
1374 static PetscErrorCode MatMPIDenseSetPreallocation_MPIDense(Mat mat, PetscScalar *data)
1375 {
1376   Mat_MPIDense *a     = (Mat_MPIDense *)mat->data;
1377   MatType       mtype = MATSEQDENSE;
1378 
1379   PetscFunctionBegin;
1380   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1381   PetscCall(PetscLayoutSetUp(mat->rmap));
1382   PetscCall(PetscLayoutSetUp(mat->cmap));
1383   if (!a->A) {
1384     PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
1385     PetscCall(MatSetSizes(a->A, mat->rmap->n, mat->cmap->N, mat->rmap->n, mat->cmap->N));
1386   }
1387 #if defined(PETSC_HAVE_CUDA)
1388   PetscBool iscuda;
1389   PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSECUDA, &iscuda));
1390   if (iscuda) mtype = MATSEQDENSECUDA;
1391 #endif
1392 #if defined(PETSC_HAVE_HIP)
1393   PetscBool iship;
1394   PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSEHIP, &iship));
1395   if (iship) mtype = MATSEQDENSEHIP;
1396 #endif
1397   PetscCall(MatSetType(a->A, mtype));
1398   PetscCall(MatSeqDenseSetPreallocation(a->A, data));
1399 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
1400   mat->offloadmask = a->A->offloadmask;
1401 #endif
1402   mat->preallocated = PETSC_TRUE;
1403   mat->assembled    = PETSC_TRUE;
1404   PetscFunctionReturn(PETSC_SUCCESS);
1405 }
1406 
1407 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIDense(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
1408 {
1409   Mat B, C;
1410 
1411   PetscFunctionBegin;
1412   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &C));
1413   PetscCall(MatConvert_SeqAIJ_SeqDense(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &B));
1414   PetscCall(MatDestroy(&C));
1415   if (reuse == MAT_REUSE_MATRIX) {
1416     C = *newmat;
1417   } else C = NULL;
1418   PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
1419   PetscCall(MatDestroy(&B));
1420   if (reuse == MAT_INPLACE_MATRIX) PetscCall(MatHeaderReplace(A, &C));
1421   else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
1422   PetscFunctionReturn(PETSC_SUCCESS);
1423 }
1424 
1425 static PetscErrorCode MatConvert_MPIDense_MPIAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
1426 {
1427   Mat B, C;
1428 
1429   PetscFunctionBegin;
1430   PetscCall(MatDenseGetLocalMatrix(A, &C));
1431   PetscCall(MatConvert_SeqDense_SeqAIJ(C, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
1432   if (reuse == MAT_REUSE_MATRIX) {
1433     C = *newmat;
1434   } else C = NULL;
1435   PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
1436   PetscCall(MatDestroy(&B));
1437   if (reuse == MAT_INPLACE_MATRIX) PetscCall(MatHeaderReplace(A, &C));
1438   else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
1439   PetscFunctionReturn(PETSC_SUCCESS);
1440 }
1441 
1442 #if defined(PETSC_HAVE_ELEMENTAL)
1443 PETSC_INTERN PetscErrorCode MatConvert_MPIDense_Elemental(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
1444 {
1445   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1446   Mat           mat_elemental;
1447   PetscScalar  *v;
1448   PetscInt      m = A->rmap->n, N = A->cmap->N, rstart = A->rmap->rstart, i, *rows, *cols, lda;
1449 
1450   PetscFunctionBegin;
1451   if (reuse == MAT_REUSE_MATRIX) {
1452     mat_elemental = *newmat;
1453     PetscCall(MatZeroEntries(*newmat));
1454   } else {
1455     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat_elemental));
1456     PetscCall(MatSetSizes(mat_elemental, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
1457     PetscCall(MatSetType(mat_elemental, MATELEMENTAL));
1458     PetscCall(MatSetUp(mat_elemental));
1459     PetscCall(MatSetOption(mat_elemental, MAT_ROW_ORIENTED, PETSC_FALSE));
1460   }
1461 
1462   PetscCall(PetscMalloc2(m, &rows, N, &cols));
1463   for (i = 0; i < N; i++) cols[i] = i;
1464   for (i = 0; i < m; i++) rows[i] = rstart + i;
1465 
1466   /* the PETSc-Elemental interface uses axpy for setting off-processor entries; only ADD_VALUES is allowed */
1467   PetscCall(MatDenseGetArray(A, &v));
1468   PetscCall(MatDenseGetLDA(a->A, &lda));
1469   if (lda == m) PetscCall(MatSetValues(mat_elemental, m, rows, N, cols, v, ADD_VALUES));
1470   else {
1471     for (i = 0; i < N; i++) PetscCall(MatSetValues(mat_elemental, m, rows, 1, &i, v + lda * i, ADD_VALUES));
1472   }
1473   PetscCall(MatAssemblyBegin(mat_elemental, MAT_FINAL_ASSEMBLY));
1474   PetscCall(MatAssemblyEnd(mat_elemental, MAT_FINAL_ASSEMBLY));
1475   PetscCall(MatDenseRestoreArray(A, &v));
1476   PetscCall(PetscFree2(rows, cols));
1477 
1478   if (reuse == MAT_INPLACE_MATRIX) {
1479     PetscCall(MatHeaderReplace(A, &mat_elemental));
1480   } else {
1481     *newmat = mat_elemental;
1482   }
1483   PetscFunctionReturn(PETSC_SUCCESS);
1484 }
1485 #endif
1486 
1487 static PetscErrorCode MatDenseGetColumn_MPIDense(Mat A, PetscInt col, PetscScalar **vals)
1488 {
1489   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
1490 
1491   PetscFunctionBegin;
1492   PetscCall(MatDenseGetColumn(mat->A, col, vals));
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
1495 
1496 static PetscErrorCode MatDenseRestoreColumn_MPIDense(Mat A, PetscScalar **vals)
1497 {
1498   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
1499 
1500   PetscFunctionBegin;
1501   PetscCall(MatDenseRestoreColumn(mat->A, vals));
1502   PetscFunctionReturn(PETSC_SUCCESS);
1503 }
1504 
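/*
   Stacks the rows of the per-rank sequential matrices inmat into a single parallel matrix. The
   symbolic phase (MAT_INITIAL_MATRIX) determines the column layout and creates the parallel
   matrix; the numeric phase simply copies inmat into the local block.
*/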
1505 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIDense(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
1506 {
1507   Mat_MPIDense *mat;
1508   PetscInt      m, nloc, N;
1509 
1510   PetscFunctionBegin;
1511   PetscCall(MatGetSize(inmat, &m, &N));
1512   PetscCall(MatGetLocalSize(inmat, NULL, &nloc));
1513   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
1514     PetscInt sum;
1515 
1516     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
1517     /* Check sum(n) = N */
1518     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
1519     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
1520 
1521     PetscCall(MatCreateDense(comm, m, n, PETSC_DETERMINE, N, NULL, outmat));
1522     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
1523   }
1524 
1525   /* numeric phase */
1526   mat = (Mat_MPIDense *)(*outmat)->data;
1527   PetscCall(MatCopy(inmat, mat->A, SAME_NONZERO_PATTERN));
1528   PetscFunctionReturn(PETSC_SUCCESS);
1529 }
1530 
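/*
   The column vector access routines below share the single cached vector a->cvec: its array is
   placed at column col of the local storage (offset col*lda) with VecPlaceArray(), and
   a->vecinuse records col+1 so that the matching restore routine can be enforced. The Read and
   Write variants differ only in which array accessor they use and in read-locking the vector.
*/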
1531 PetscErrorCode MatDenseGetColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
1532 {
1533   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1534   PetscInt      lda;
1535 
1536   PetscFunctionBegin;
1537   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1538   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1539   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1540   a->vecinuse = col + 1;
1541   PetscCall(MatDenseGetLDA(a->A, &lda));
1542   PetscCall(MatDenseGetArray(a->A, (PetscScalar **)&a->ptrinuse));
1543   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1544   *v = a->cvec;
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 PetscErrorCode MatDenseRestoreColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
1549 {
1550   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1551 
1552   PetscFunctionBegin;
1553   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1554   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1555   VecCheckAssembled(a->cvec);
1556   a->vecinuse = 0;
1557   PetscCall(MatDenseRestoreArray(a->A, (PetscScalar **)&a->ptrinuse));
1558   PetscCall(VecResetArray(a->cvec));
1559   if (v) *v = NULL;
1560   PetscFunctionReturn(PETSC_SUCCESS);
1561 }
1562 
1563 PetscErrorCode MatDenseGetColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
1564 {
1565   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1566   PetscInt      lda;
1567 
1568   PetscFunctionBegin;
1569   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1570   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1571   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1572   a->vecinuse = col + 1;
1573   PetscCall(MatDenseGetLDA(a->A, &lda));
1574   PetscCall(MatDenseGetArrayRead(a->A, &a->ptrinuse));
1575   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1576   PetscCall(VecLockReadPush(a->cvec));
1577   *v = a->cvec;
1578   PetscFunctionReturn(PETSC_SUCCESS);
1579 }
1580 
1581 PetscErrorCode MatDenseRestoreColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
1582 {
1583   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1584 
1585   PetscFunctionBegin;
1586   PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1587   PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
1588   VecCheckAssembled(a->cvec);
1589   a->vecinuse = 0;
1590   PetscCall(MatDenseRestoreArrayRead(a->A, &a->ptrinuse));
1591   PetscCall(VecLockReadPop(a->cvec));
1592   PetscCall(VecResetArray(a->cvec));
1593   if (v) *v = NULL;
1594   PetscFunctionReturn(PETSC_SUCCESS);
1595 }
1596 
1597 PetscErrorCode MatDenseGetColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
1598 {
1599   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1600   PetscInt      lda;
1601 
1602   PetscFunctionBegin;
1603   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1604   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1605   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1606   a->vecinuse = col + 1;
1607   PetscCall(MatDenseGetLDA(a->A, &lda));
1608   PetscCall(MatDenseGetArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
1609   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1610   *v = a->cvec;
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 PetscErrorCode MatDenseRestoreColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
1615 {
1616   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1617 
1618   PetscFunctionBegin;
1619   PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1620   PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
1621   VecCheckAssembled(a->cvec);
1622   a->vecinuse = 0;
1623   PetscCall(MatDenseRestoreArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
1624   PetscCall(VecResetArray(a->cvec));
1625   if (v) *v = NULL;
1626   PetscFunctionReturn(PETSC_SUCCESS);
1627 }
1628 
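/*
   Returns a cached parallel view a->cmat of rows [rbegin, rend) and columns [cbegin, cend) of A.
   The requested global ranges are first clipped to this rank's portion (prbegin/prend and
   pcbegin/pcend); the cached matrix's layouts are rebuilt only when the requested ranges change,
   and its local block is obtained as a sub-matrix view of a->A.
*/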
1629 static PetscErrorCode MatDenseGetSubMatrix_MPIDense(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1630 {
1631   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1632   Mat_MPIDense *c;
1633   MPI_Comm      comm;
1634   PetscInt      prbegin, prend, pcbegin, pcend;
1635 
1636   PetscFunctionBegin;
1637   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1638   PetscCheck(!a->vecinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1639   PetscCheck(!a->matinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1640   prbegin = PetscMax(0, PetscMin(A->rmap->rend, rbegin) - A->rmap->rstart);
1641   prend   = PetscMin(A->rmap->n, PetscMax(0, rend - A->rmap->rstart));
1642   pcbegin = PetscMax(0, PetscMin(A->cmap->rend, cbegin) - A->cmap->rstart);
1643   pcend   = PetscMin(A->cmap->n, PetscMax(0, cend - A->cmap->rstart));
1644   if (!a->cmat) {
1645     PetscCall(MatCreate(comm, &a->cmat));
1646     PetscCall(MatSetType(a->cmat, ((PetscObject)A)->type_name));
1647     if (rend - rbegin == A->rmap->N) PetscCall(PetscLayoutReference(A->rmap, &a->cmat->rmap));
1648     else {
1649       PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
1650       PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
1651       PetscCall(PetscLayoutSetUp(a->cmat->rmap));
1652     }
1653     if (cend - cbegin == A->cmap->N) PetscCall(PetscLayoutReference(A->cmap, &a->cmat->cmap));
1654     else {
1655       PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
1656       PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
1657       PetscCall(PetscLayoutSetUp(a->cmat->cmap));
1658     }
1659     c             = (Mat_MPIDense *)a->cmat->data;
1660     c->sub_rbegin = rbegin;
1661     c->sub_rend   = rend;
1662     c->sub_cbegin = cbegin;
1663     c->sub_cend   = cend;
1664   }
1665   c = (Mat_MPIDense *)a->cmat->data;
1666   if (c->sub_rbegin != rbegin || c->sub_rend != rend) {
1667     PetscCall(PetscLayoutDestroy(&a->cmat->rmap));
1668     PetscCall(PetscLayoutCreate(comm, &a->cmat->rmap));
1669     PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
1670     PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
1671     PetscCall(PetscLayoutSetUp(a->cmat->rmap));
1672     c->sub_rbegin = rbegin;
1673     c->sub_rend   = rend;
1674   }
1675   if (c->sub_cbegin != cbegin || c->sub_cend != cend) {
1676     // special optimization: check if all columns are owned by rank 0, in which case no communication is necessary
1677     if ((cend - cbegin != a->cmat->cmap->N) || (A->cmap->range[1] != A->cmap->N)) {
1678       PetscCall(PetscLayoutDestroy(&a->cmat->cmap));
1679       PetscCall(PetscLayoutCreate(comm, &a->cmat->cmap));
1680       PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
1681       PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
1682       PetscCall(PetscLayoutSetUp(a->cmat->cmap));
1683       PetscCall(VecDestroy(&c->lvec));
1684       PetscCall(PetscSFDestroy(&c->Mvctx));
1685     }
1686     c->sub_cbegin = cbegin;
1687     c->sub_cend   = cend;
1688   }
1689   PetscCheck(!c->A, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1690   PetscCall(MatDenseGetSubMatrix(a->A, prbegin, prend, cbegin, cend, &c->A));
1691 
1692   a->cmat->preallocated = PETSC_TRUE;
1693   a->cmat->assembled    = PETSC_TRUE;
1694 #if defined(PETSC_HAVE_DEVICE)
1695   a->cmat->offloadmask = c->A->offloadmask;
1696 #endif
1697   a->matinuse = cbegin + 1;
1698   *v          = a->cmat;
1699   PetscFunctionReturn(PETSC_SUCCESS);
1700 }
1701 
1702 static PetscErrorCode MatDenseRestoreSubMatrix_MPIDense(Mat A, Mat *v)
1703 {
1704   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1705   Mat_MPIDense *c;
1706 
1707   PetscFunctionBegin;
1708   PetscCheck(a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1709   PetscCheck(a->cmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal matrix");
1710   PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1711   a->matinuse = 0;
1712   c           = (Mat_MPIDense *)a->cmat->data;
1713   PetscCall(MatDenseRestoreSubMatrix(a->A, &c->A));
1714   if (v) *v = NULL;
1715 #if defined(PETSC_HAVE_DEVICE)
1716   A->offloadmask = a->A->offloadmask;
1717 #endif
1718   PetscFunctionReturn(PETSC_SUCCESS);
1719 }
1720 
1721 /*MC
1722    MATMPIDENSE - MATMPIDENSE = "mpidense" - A matrix type to be used for distributed dense matrices.
1723 
1724    Options Database Key:
1725 . -mat_type mpidense - sets the matrix type to `MATMPIDENSE` during a call to `MatSetFromOptions()`
1726 
1727   Level: beginner
1728 
1729 .seealso: [](ch_matrices), `Mat`, `MatCreateDense()`, `MATSEQDENSE`, `MATDENSE`
1730 M*/
1731 PetscErrorCode MatCreate_MPIDense(Mat mat)
1732 {
1733   Mat_MPIDense *a;
1734 
1735   PetscFunctionBegin;
1736   PetscCall(PetscNew(&a));
1737   mat->data   = (void *)a;
1738   mat->ops[0] = MatOps_Values;
1739 
1740   mat->insertmode = NOT_SET_VALUES;
1741 
1742   /* build cache for off-process entries formed during MatSetValues() */
1743   a->donotstash = PETSC_FALSE;
1744 
1745   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)mat), 1, &mat->stash));
1746 
1747   /* data used for matrix-vector multiply */
1748   a->lvec        = NULL;
1749   a->Mvctx       = NULL;
1750   a->roworiented = PETSC_TRUE;
1751 
1752   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", MatDenseGetLDA_MPIDense));
1753   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", MatDenseSetLDA_MPIDense));
1754   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", MatDenseGetArray_MPIDense));
1755   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", MatDenseRestoreArray_MPIDense));
1756   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_MPIDense));
1757   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", MatDenseRestoreArrayRead_MPIDense));
1758   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_MPIDense));
1759   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", MatDenseRestoreArrayWrite_MPIDense));
1760   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", MatDensePlaceArray_MPIDense));
1761   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", MatDenseResetArray_MPIDense));
1762   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", MatDenseReplaceArray_MPIDense));
1763   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_MPIDense));
1764   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_MPIDense));
1765   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_MPIDense));
1766   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_MPIDense));
1767   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_MPIDense));
1768   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_MPIDense));
1769   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_MPIDense));
1770   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_MPIDense));
1771   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
1772   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", MatConvert_MPIDense_MPIAIJ));
1773 #if defined(PETSC_HAVE_ELEMENTAL)
1774   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", MatConvert_MPIDense_Elemental));
1775 #endif
1776 #if defined(PETSC_HAVE_SCALAPACK)
1777   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", MatConvert_Dense_ScaLAPACK));
1778 #endif
1779 #if defined(PETSC_HAVE_CUDA)
1780   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", MatConvert_MPIDense_MPIDenseCUDA));
1781 #endif
1782   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", MatMPIDenseSetPreallocation_MPIDense));
1783   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
1784   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
1785 #if defined(PETSC_HAVE_CUDA)
1786   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
1787   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
1788 #endif
1789 #if defined(PETSC_HAVE_HIP)
1790   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", MatConvert_MPIDense_MPIDenseHIP));
1791   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
1792   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
1793 #endif
1794   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", MatDenseGetColumn_MPIDense));
1795   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", MatDenseRestoreColumn_MPIDense));
1796   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", MatMultColumnRange_MPIDense));
1797   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", MatMultAddColumnRange_MPIDense));
1798   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_MPIDense));
1799   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_MPIDense));
1800   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATMPIDENSE));
1801   PetscFunctionReturn(PETSC_SUCCESS);
1802 }
1803 
1804 /*MC
1805    MATDENSE - MATDENSE = "dense" - A matrix type to be used for dense matrices.
1806 
1807    This matrix type is identical to `MATSEQDENSE` when constructed with a single-process communicator,
1808    and `MATMPIDENSE` otherwise.
1809 
1810    Options Database Key:
1811 . -mat_type dense - sets the matrix type to `MATDENSE` during a call to `MatSetFromOptions()`
1812 
1813   Level: beginner
1814 
1815 .seealso: [](ch_matrices), `Mat`, `MATSEQDENSE`, `MATMPIDENSE`, `MATDENSECUDA`, `MATDENSEHIP`
1816 M*/
1817 
1818 /*@
1819   MatMPIDenseSetPreallocation - Sets the array used to store the matrix entries
1820 
1821   Collective
1822 
1823   Input Parameters:
1824 + B    - the matrix
1825 - data - optional location of matrix data.  Set to `NULL` for PETSc
1826          to control all matrix memory allocation.
1827 
1828   Level: intermediate
1829 
1830   Notes:
1831   The dense format is fully compatible with standard Fortran
1832   storage by columns.
1833 
1834   The data input variable is intended primarily for Fortran programmers
1835   who wish to allocate their own matrix memory space.  Most users should
1836   set `data` to `NULL`.
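
  Example Usage:
  A minimal sketch in which PETSc manages all storage; `comm` stands for the target MPI
  communicator and the sizes are illustrative (error checking omitted):
.vb
  Mat B;

  MatCreate(comm, &B);
  MatSetSizes(B, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
  MatSetType(B, MATMPIDENSE);
  MatMPIDenseSetPreallocation(B, NULL); /* PETSc allocates the local column-major array */
.ve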
1837 
1838 .seealso: [](ch_matrices), `Mat`, `MATMPIDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
1839 @*/
1840 PetscErrorCode MatMPIDenseSetPreallocation(Mat B, PetscScalar *data)
1841 {
1842   PetscFunctionBegin;
1843   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1844   PetscTryMethod(B, "MatMPIDenseSetPreallocation_C", (Mat, PetscScalar *), (B, data));
1845   PetscFunctionReturn(PETSC_SUCCESS);
1846 }
1847 
1848 /*@
1849   MatDensePlaceArray - Allows one to replace the array in a `MATDENSE` matrix with an
1850   array provided by the user. This is useful to avoid copying an array
1851   into a matrix.
1852 
1853   Not Collective
1854 
1855   Input Parameters:
1856 + mat   - the matrix
1857 - array - the array in column major order
1858 
1859   Level: developer
1860 
1861   Note:
1862   Adding `const` to `array` was an oversight; see the notes in `VecPlaceArray()`.
1863 
1864   You can return to the original array with a call to `MatDenseResetArray()`. The user is responsible for freeing this array; it will not be
1865   freed when the matrix is destroyed.
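
  Example Usage:
  A minimal sketch pairing this routine with `MatDenseResetArray()`; `myarray` stands for a
  user-provided column-major array assumed large enough for the local block (error checking omitted):
.vb
  MatDensePlaceArray(mat, myarray); /* mat now wraps myarray; no copy is made */
  /* ... use mat ... */
  MatDenseResetArray(mat);          /* mat uses its original storage again */
.ve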
1866 
1867 .seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDenseResetArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`,
1868           `MatDenseReplaceArray()`
1869 @*/
1870 PetscErrorCode MatDensePlaceArray(Mat mat, const PetscScalar *array)
1871 {
1872   PetscFunctionBegin;
1873   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
1874   PetscUseMethod(mat, "MatDensePlaceArray_C", (Mat, const PetscScalar *), (mat, array));
1875   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
1876 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
1877   mat->offloadmask = PETSC_OFFLOAD_CPU;
1878 #endif
1879   PetscFunctionReturn(PETSC_SUCCESS);
1880 }
1881 
1882 /*@
1883   MatDenseResetArray - Resets the matrix array to the array it had before the call to `MatDensePlaceArray()`
1884 
1885   Not Collective
1886 
1887   Input Parameter:
1888 . mat - the matrix
1889 
1890   Level: developer
1891 
1892   Note:
1893   You can only call this after a call to `MatDensePlaceArray()`
1894 
1895 .seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDensePlaceArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`
1896 @*/
1897 PetscErrorCode MatDenseResetArray(Mat mat)
1898 {
1899   PetscFunctionBegin;
1900   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
1901   PetscUseMethod(mat, "MatDenseResetArray_C", (Mat), (mat));
1902   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
1903   PetscFunctionReturn(PETSC_SUCCESS);
1904 }
1905 
1906 /*@
1907   MatDenseReplaceArray - Allows one to replace the array in a dense matrix with an
1908   array provided by the user. This is useful to avoid copying an array
1909   into a matrix.
1910 
1911   Not Collective
1912 
1913   Input Parameters:
1914 + mat   - the matrix
1915 - array - the array in column major order
1916 
1917   Level: developer
1918 
1919   Note:
1920   Adding `const` to `array` was an oversight; see the notes in `VecPlaceArray()`.
1921 
1922   The memory passed in MUST be obtained with `PetscMalloc()` and CANNOT be
1923   freed by the user. It will be freed when the matrix is destroyed.
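
  Example Usage:
  A minimal sketch; `lda` and `n` stand for the local leading dimension and number of columns,
  and the array must come from `PetscMalloc()` because the matrix takes ownership of it:
.vb
  PetscScalar *newarray;

  PetscMalloc1(lda * n, &newarray);
  /* ... fill newarray in column-major order ... */
  MatDenseReplaceArray(mat, newarray); /* mat now owns newarray and will free it */
.ve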
1924 
1925 .seealso: [](ch_matrices), `Mat`, `MatDensePlaceArray()`, `MatDenseGetArray()`, `VecReplaceArray()`
1926 @*/
1927 PetscErrorCode MatDenseReplaceArray(Mat mat, const PetscScalar *array)
1928 {
1929   PetscFunctionBegin;
1930   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
1931   PetscUseMethod(mat, "MatDenseReplaceArray_C", (Mat, const PetscScalar *), (mat, array));
1932   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
1933 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
1934   mat->offloadmask = PETSC_OFFLOAD_CPU;
1935 #endif
1936   PetscFunctionReturn(PETSC_SUCCESS);
1937 }
1938 
1939 /*@
1940   MatCreateDense - Creates a matrix in `MATDENSE` format.
1941 
1942   Collective
1943 
1944   Input Parameters:
1945 + comm - MPI communicator
1946 . m    - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
1947 . n    - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
1948 . M    - number of global rows (or `PETSC_DECIDE` to have it calculated if `m` is given)
1949 . N    - number of global columns (or `PETSC_DECIDE` to have it calculated if `n` is given)
1950 - data - optional location of matrix data.  Set data to `NULL` (`PETSC_NULL_SCALAR_ARRAY` for Fortran users) for PETSc
1951    to control all matrix memory allocation.
1952 
1953   Output Parameter:
1954 . A - the matrix
1955 
1956   Level: intermediate
1957 
1958   Notes:
1959   The dense format is fully compatible with standard Fortran
1960   storage by columns.
1961 
1962   Although local portions of the matrix are stored in column-major
1963   order, the matrix is partitioned across MPI ranks by row.
1964 
1965   The data input variable is intended primarily for Fortran programmers
1966   who wish to allocate their own matrix memory space.  Most users should
1967   set `data` to `NULL` (`PETSC_NULL_SCALAR_ARRAY` for Fortran users).
1968 
1969   The user MUST specify either the local or global matrix dimensions
1970   (possibly both).
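
  Example Usage:
  A minimal sketch creating a 100 x 50 matrix with PETSc-managed storage and a PETSc-chosen row
  distribution (error checking omitted):
.vb
  Mat A;

  MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 50, NULL, &A);
  /* ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd(), then use A ... */
  MatDestroy(&A);
.ve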
1971 
1972 .seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
1973 @*/
1974 PetscErrorCode MatCreateDense(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar data[], Mat *A)
1975 {
1976   PetscFunctionBegin;
1977   PetscCall(MatCreate(comm, A));
1978   PetscCall(MatSetSizes(*A, m, n, M, N));
1979   PetscCall(MatSetType(*A, MATDENSE));
1980   PetscCall(MatSeqDenseSetPreallocation(*A, data));
1981   PetscCall(MatMPIDenseSetPreallocation(*A, data));
1982   PetscFunctionReturn(PETSC_SUCCESS);
1983 }
1984 
1985 static PetscErrorCode MatDuplicate_MPIDense(Mat A, MatDuplicateOption cpvalues, Mat *newmat)
1986 {
1987   Mat           mat;
1988   Mat_MPIDense *a, *oldmat = (Mat_MPIDense *)A->data;
1989 
1990   PetscFunctionBegin;
1991   *newmat = NULL;
1992   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat));
1993   PetscCall(MatSetSizes(mat, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
1994   PetscCall(MatSetType(mat, ((PetscObject)A)->type_name));
1995   a = (Mat_MPIDense *)mat->data;
1996 
1997   mat->factortype   = A->factortype;
1998   mat->assembled    = PETSC_TRUE;
1999   mat->preallocated = PETSC_TRUE;
2000 
2001   mat->insertmode = NOT_SET_VALUES;
2002   a->donotstash   = oldmat->donotstash;
2003 
2004   PetscCall(PetscLayoutReference(A->rmap, &mat->rmap));
2005   PetscCall(PetscLayoutReference(A->cmap, &mat->cmap));
2006 
2007   PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
2008 
2009   *newmat = mat;
2010   PetscFunctionReturn(PETSC_SUCCESS);
2011 }
2012 
2013 static PetscErrorCode MatLoad_MPIDense(Mat newMat, PetscViewer viewer)
2014 {
2015   PetscBool isbinary;
2016 #if defined(PETSC_HAVE_HDF5)
2017   PetscBool ishdf5;
2018 #endif
2019 
2020   PetscFunctionBegin;
2021   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
2022   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
2023   /* force binary viewer to load .info file if it has not yet done so */
2024   PetscCall(PetscViewerSetUp(viewer));
2025   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
2026 #if defined(PETSC_HAVE_HDF5)
2027   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
2028 #endif
2029   if (isbinary) {
2030     PetscCall(MatLoad_Dense_Binary(newMat, viewer));
2031 #if defined(PETSC_HAVE_HDF5)
2032   } else if (ishdf5) {
2033     PetscCall(MatLoad_Dense_HDF5(newMat, viewer));
2034 #endif
2035   } else SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
2036   PetscFunctionReturn(PETSC_SUCCESS);
2037 }
2038 
2039 static PetscErrorCode MatEqual_MPIDense(Mat A, Mat B, PetscBool *flag)
2040 {
2041   Mat_MPIDense *matB = (Mat_MPIDense *)B->data, *matA = (Mat_MPIDense *)A->data;
2042   Mat           a, b;
2043 
2044   PetscFunctionBegin;
2045   a = matA->A;
2046   b = matB->A;
2047   PetscCall(MatEqual(a, b, flag));
2048   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2049   PetscFunctionReturn(PETSC_SUCCESS);
2050 }
2051 
2052 static PetscErrorCode MatDestroy_MatTransMatMult_MPIDense_MPIDense(void *data)
2053 {
2054   Mat_TransMatMultDense *atb = (Mat_TransMatMultDense *)data;
2055 
2056   PetscFunctionBegin;
2057   PetscCall(PetscFree2(atb->sendbuf, atb->recvcounts));
2058   PetscCall(MatDestroy(&atb->atb));
2059   PetscCall(PetscFree(atb));
2060   PetscFunctionReturn(PETSC_SUCCESS);
2061 }
2062 
2063 static PetscErrorCode MatDestroy_MatMatTransMult_MPIDense_MPIDense(void *data)
2064 {
2065   Mat_MatTransMultDense *abt = (Mat_MatTransMultDense *)data;
2066 
2067   PetscFunctionBegin;
2068   PetscCall(PetscFree2(abt->buf[0], abt->buf[1]));
2069   PetscCall(PetscFree2(abt->recvcounts, abt->recvdispls));
2070   PetscCall(PetscFree(abt));
2071   PetscFunctionReturn(PETSC_SUCCESS);
2072 }
2073 
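/*
   Computes C = A^T * B. Each rank first forms the full-sized local contribution aseq^T * bseq;
   the per-rank contributions are then summed and delivered to the row owners of C, either via a
   PetscSF reduction onto rank 0 (when rank 0 owns all of C) or via MPI_Reduce_scatter() in the
   general case.
*/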
2074 static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
2075 {
2076   Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
2077   Mat_TransMatMultDense *atb;
2078   MPI_Comm               comm;
2079   PetscMPIInt            size, *recvcounts;
2080   PetscScalar           *carray, *sendbuf;
2081   const PetscScalar     *atbarray;
2082   PetscInt               i, cN = C->cmap->N, proc, k, j, lda;
2083   const PetscInt        *ranges;
2084 
2085   PetscFunctionBegin;
2086   MatCheckProduct(C, 3);
2087   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2088   atb        = (Mat_TransMatMultDense *)C->product->data;
2089   recvcounts = atb->recvcounts;
2090   sendbuf    = atb->sendbuf;
2091 
2092   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2093   PetscCallMPI(MPI_Comm_size(comm, &size));
2094 
2095   /* compute atbarray = aseq^T * bseq */
2096   PetscCall(MatTransposeMatMult(a->A, b->A, atb->atb ? MAT_REUSE_MATRIX : MAT_INITIAL_MATRIX, PETSC_DETERMINE, &atb->atb));
2097 
2098   PetscCall(MatGetOwnershipRanges(C, &ranges));
2099 
2100   if (ranges[1] == C->rmap->N) {
2101     /* all of the values are being reduced to rank 0: optimize this case with a PetscSF reduction, which can use MPI_Reduce and GPU-aware MPI if available */
2102     PetscInt           atb_lda, c_lda;
2103     Mat                atb_local = atb->atb;
2104     Mat                atb_alloc = NULL;
2105     Mat                c_local   = c->A;
2106     Mat                c_alloc   = NULL;
2107     PetscMemType       atb_memtype, c_memtype;
2108     const PetscScalar *atb_array = NULL;
2109     MPI_Datatype       vector_type;
2110     PetscScalar       *c_array = NULL;
2111     PetscMPIInt        rank;
2112 
2113     PetscCallMPI(MPI_Comm_rank(comm, &rank));
2114 
2115     PetscCall(MatDenseGetLDA(atb_local, &atb_lda));
2116     if (atb_lda != C->rmap->N) {
2117       // copy atb to a matrix that will have lda == the number of rows
2118       PetscCall(MatDuplicate(atb_local, MAT_DO_NOT_COPY_VALUES, &atb_alloc));
2119       PetscCall(MatCopy(atb_local, atb_alloc, DIFFERENT_NONZERO_PATTERN));
2120       atb_local = atb_alloc;
2121     }
2122 
2123     if (rank == 0) {
2124       PetscCall(MatDenseGetLDA(c_local, &c_lda));
2125       if (c_lda != C->rmap->N) {
2126         // use a scratch matrix with lda == the number of rows; the result is copied back to c->A after the reduction
2127         PetscCall(MatDuplicate(c_local, MAT_DO_NOT_COPY_VALUES, &c_alloc));
2128         c_local = c_alloc;
2129       }
2130       PetscCall(MatZeroEntries(c_local));
2131     }
2132     /* atb_local and c_local have nrows = lda = A->cmap->N and ncols =
2133      * B->cmap->N: use a->Mvctx to select the best reduction method */
2134     if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
2135     vector_type = MPIU_SCALAR;
2136     if (B->cmap->N > 1) {
2137       PetscMPIInt mpi_N;
2138 
2139       PetscCall(PetscMPIIntCast(B->cmap->N, &mpi_N));
2140       PetscCallMPI(MPI_Type_contiguous(mpi_N, MPIU_SCALAR, &vector_type));
2141       PetscCallMPI(MPI_Type_commit(&vector_type));
2142     }
2143     PetscCall(MatDenseGetArrayReadAndMemType(atb_local, &atb_array, &atb_memtype));
2144     PetscCall(MatDenseGetArrayWriteAndMemType(c_local, &c_array, &c_memtype));
2145     PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, vector_type, atb_memtype, atb_array, c_memtype, c_array, MPIU_SUM));
2146     PetscCall(PetscSFReduceEnd(a->Mvctx, vector_type, atb_array, c_array, MPIU_SUM));
2147     PetscCall(MatDenseRestoreArrayWriteAndMemType(c_local, &c_array));
2148     PetscCall(MatDenseRestoreArrayReadAndMemType(atb_local, &atb_array));
2149     if (rank == 0 && c_local != c->A) PetscCall(MatCopy(c_local, c->A, DIFFERENT_NONZERO_PATTERN));
2150     if (B->cmap->N > 1) PetscCallMPI(MPI_Type_free(&vector_type));
2151     PetscCall(MatDestroy(&atb_alloc));
2152     PetscCall(MatDestroy(&c_alloc));
2153     PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2154     PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2155     PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2156     PetscFunctionReturn(PETSC_SUCCESS);
2157   }
2158 
2159   /* arrange atbarray into sendbuf */
2160   PetscCall(MatDenseGetArrayRead(atb->atb, &atbarray));
2161   PetscCall(MatDenseGetLDA(atb->atb, &lda));
2162   for (proc = 0, k = 0; proc < size; proc++) {
2163     for (j = 0; j < cN; j++) {
2164       for (i = ranges[proc]; i < ranges[proc + 1]; i++) sendbuf[k++] = atbarray[i + j * lda];
2165     }
2166   }
2167   PetscCall(MatDenseRestoreArrayRead(atb->atb, &atbarray));
2168 
2169   /* sum all atbarray to local values of C */
2170   PetscCall(MatDenseGetArrayWrite(c->A, &carray));
2171   PetscCallMPI(MPI_Reduce_scatter(sendbuf, carray, recvcounts, MPIU_SCALAR, MPIU_SUM, comm));
2172   PetscCall(MatDenseRestoreArrayWrite(c->A, &carray));
2173   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2174   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2175   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2176   PetscFunctionReturn(PETSC_SUCCESS);
2177 }
2178 
2179 static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
2180 {
2181   MPI_Comm               comm;
2182   PetscMPIInt            size;
2183   PetscInt               cm = A->cmap->n, cM, cN = B->cmap->N;
2184   Mat_TransMatMultDense *atb;
2185   PetscBool              cisdense = PETSC_FALSE;
2186   const PetscInt        *ranges;
2187 
2188   PetscFunctionBegin;
2189   MatCheckProduct(C, 4);
2190   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
2191   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2192   PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart,
2193              A->rmap->rend, B->rmap->rstart, B->rmap->rend);
2194 
2195   /* create matrix product C */
2196   PetscCall(MatSetSizes(C, cm, B->cmap->n, A->cmap->N, B->cmap->N));
2197 #if defined(PETSC_HAVE_CUDA)
2198   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, ""));
2199 #endif
2200 #if defined(PETSC_HAVE_HIP)
2201   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSEHIP, ""));
2202 #endif
2203   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
2204   PetscCall(MatSetUp(C));
2205 
2206   /* create data structure for reuse C */
2207   PetscCallMPI(MPI_Comm_size(comm, &size));
2208   PetscCall(PetscNew(&atb));
2209   cM = C->rmap->N;
2210   PetscCall(PetscMalloc2(cM * cN, &atb->sendbuf, size, &atb->recvcounts));
2211   PetscCall(MatGetOwnershipRanges(C, &ranges));
2212   for (PetscMPIInt i = 0; i < size; i++) PetscCall(PetscMPIIntCast((ranges[i + 1] - ranges[i]) * cN, &atb->recvcounts[i]));
2213   C->product->data    = atb;
2214   C->product->destroy = MatDestroy_MatTransMatMult_MPIDense_MPIDense;
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
2218 static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
2219 {
2220   MPI_Comm               comm;
2221   PetscMPIInt            i, size;
2222   PetscInt               maxRows, bufsiz;
2223   PetscMPIInt            tag;
2224   PetscInt               alg;
2225   Mat_MatTransMultDense *abt;
2226   Mat_Product           *product = C->product;
2227   PetscBool              flg;
2228 
2229   PetscFunctionBegin;
2230   MatCheckProduct(C, 4);
2231   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
2232   /* check local size of A and B */
2233   PetscCheck(A->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local column dimensions are incompatible, A (%" PetscInt_FMT ") != B (%" PetscInt_FMT ")", A->cmap->n, B->cmap->n);
2234 
2235   PetscCall(PetscStrcmp(product->alg, "allgatherv", &flg));
2236   alg = flg ? 0 : 1;
2237 
2238   /* setup matrix product C */
2239   PetscCall(MatSetSizes(C, A->rmap->n, B->rmap->n, A->rmap->N, B->rmap->N));
2240   PetscCall(MatSetType(C, MATMPIDENSE));
2241   PetscCall(MatSetUp(C));
2242   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag));
2243 
2244   /* create data structure for reuse C */
2245   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2246   PetscCallMPI(MPI_Comm_size(comm, &size));
2247   PetscCall(PetscNew(&abt));
2248   abt->tag = tag;
2249   abt->alg = alg;
2250   switch (alg) {
2251   case 1: /* alg: "cyclic" */
2252     for (maxRows = 0, i = 0; i < size; i++) maxRows = PetscMax(maxRows, B->rmap->range[i + 1] - B->rmap->range[i]);
2253     bufsiz = A->cmap->N * maxRows;
2254     PetscCall(PetscMalloc2(bufsiz, &abt->buf[0], bufsiz, &abt->buf[1]));
2255     break;
2256   default: /* alg: "allgatherv" */
2257     PetscCall(PetscMalloc2(B->rmap->n * B->cmap->N, &abt->buf[0], B->rmap->N * B->cmap->N, &abt->buf[1]));
2258     PetscCall(PetscMalloc2(size, &abt->recvcounts, size + 1, &abt->recvdispls));
2259     for (i = 0; i <= size; i++) PetscCall(PetscMPIIntCast(B->rmap->range[i] * A->cmap->N, &abt->recvdispls[i]));
2260     for (i = 0; i < size; i++) PetscCall(PetscMPIIntCast(abt->recvdispls[i + 1] - abt->recvdispls[i], &abt->recvcounts[i]));
2261     break;
2262   }
2263 
2264   C->product->data                = abt;
2265   C->product->destroy             = MatDestroy_MatMatTransMult_MPIDense_MPIDense;
2266   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_MPIDense_MPIDense;
2267   PetscFunctionReturn(PETSC_SUCCESS);
2268 }
2269 
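/*
   Cyclic algorithm for C = A * B^T: the local row blocks of B circulate around a ring of
   processes. In each step a rank multiplies its local A block with the B block it currently
   holds (a GEMM with transb = "T"), writing into the columns of C owned by that block's origin,
   while the next block is already in flight via nonblocking send/receive.
*/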
2270 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(Mat A, Mat B, Mat C)
2271 {
2272   Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
2273   Mat_MatTransMultDense *abt;
2274   MPI_Comm               comm;
2275   PetscMPIInt            rank, size, sendto, recvfrom, recvisfrom;
2276   PetscScalar           *sendbuf, *recvbuf = NULL, *cv;
2277   PetscInt               i, cK             = A->cmap->N, sendsiz, recvsiz, k, j, bn;
2278   PetscScalar            _DOne = 1.0, _DZero = 0.0;
2279   const PetscScalar     *av, *bv;
2280   PetscBLASInt           cm, cn, ck, alda, blda = 0, clda;
2281   MPI_Request            reqs[2];
2282   const PetscInt        *ranges;
2283 
2284   PetscFunctionBegin;
2285   MatCheckProduct(C, 3);
2286   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2287   abt = (Mat_MatTransMultDense *)C->product->data;
2288   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2289   PetscCallMPI(MPI_Comm_rank(comm, &rank));
2290   PetscCallMPI(MPI_Comm_size(comm, &size));
2291   PetscCall(MatDenseGetArrayRead(a->A, &av));
2292   PetscCall(MatDenseGetArrayRead(b->A, &bv));
2293   PetscCall(MatDenseGetArrayWrite(c->A, &cv));
2294   PetscCall(MatDenseGetLDA(a->A, &i));
2295   PetscCall(PetscBLASIntCast(i, &alda));
2296   PetscCall(MatDenseGetLDA(b->A, &i));
2297   PetscCall(PetscBLASIntCast(i, &blda));
2298   PetscCall(MatDenseGetLDA(c->A, &i));
2299   PetscCall(PetscBLASIntCast(i, &clda));
2300   PetscCall(MatGetOwnershipRanges(B, &ranges));
2301   bn = B->rmap->n;
2302   if (blda == bn) {
2303     sendbuf = (PetscScalar *)bv;
2304   } else {
2305     sendbuf = abt->buf[0];
2306     for (k = 0, i = 0; i < cK; i++) {
2307       for (j = 0; j < bn; j++, k++) sendbuf[k] = bv[i * blda + j];
2308     }
2309   }
2310   if (size > 1) {
2311     sendto   = (rank + size - 1) % size;
2312     recvfrom = (rank + size + 1) % size;
2313   } else {
2314     sendto = recvfrom = 0;
2315   }
2316   PetscCall(PetscBLASIntCast(cK, &ck));
2317   PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
2318   recvisfrom = rank;
2319   for (i = 0; i < size; i++) {
2320     /* we have finished receiving and sending; the buffers can be read/modified */
2321     PetscMPIInt nextrecvisfrom = (recvisfrom + 1) % size; /* which process the next recvbuf will originate on */
2322     PetscInt    nextbn         = ranges[nextrecvisfrom + 1] - ranges[nextrecvisfrom];
2323 
2324     if (nextrecvisfrom != rank) {
2325       /* start the cyclic sends from sendbuf, to recvbuf (which will switch to sendbuf) */
2326       sendsiz = cK * bn;
2327       recvsiz = cK * nextbn;
2328       recvbuf = (i & 1) ? abt->buf[0] : abt->buf[1];
2329       PetscCallMPI(MPIU_Isend(sendbuf, sendsiz, MPIU_SCALAR, sendto, abt->tag, comm, &reqs[0]));
2330       PetscCallMPI(MPIU_Irecv(recvbuf, recvsiz, MPIU_SCALAR, recvfrom, abt->tag, comm, &reqs[1]));
2331     }
2332 
2333     /* local aseq * sendbuf^T */
2334     PetscCall(PetscBLASIntCast(ranges[recvisfrom + 1] - ranges[recvisfrom], &cn));
2335     if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "T", &cm, &cn, &ck, &_DOne, av, &alda, sendbuf, &cn, &_DZero, cv + clda * ranges[recvisfrom], &clda));
2336 
2337     if (nextrecvisfrom != rank) {
2338       /* wait for the sends and receives to complete, swap sendbuf and recvbuf */
2339       PetscCallMPI(MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE));
2340     }
2341     bn         = nextbn;
2342     recvisfrom = nextrecvisfrom;
2343     sendbuf    = recvbuf;
2344   }
2345   PetscCall(MatDenseRestoreArrayRead(a->A, &av));
2346   PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
2347   PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
2348   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2349   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2350   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2351   PetscFunctionReturn(PETSC_SUCCESS);
2352 }
2353 
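/*
   Allgatherv algorithm for C = A * B^T: each rank packs the transpose of its local B block in
   column-major order, MPI_Allgatherv() assembles the complete B^T on every rank, and a single
   local GEMM then produces this rank's rows of C.
*/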
2354 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(Mat A, Mat B, Mat C)
2355 {
2356   Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
2357   Mat_MatTransMultDense *abt;
2358   MPI_Comm               comm;
2359   PetscMPIInt            size, ibn;
2360   PetscScalar           *cv, *sendbuf, *recvbuf;
2361   const PetscScalar     *av, *bv;
2362   PetscInt               blda, i, cK = A->cmap->N, k, j, bn;
2363   PetscScalar            _DOne = 1.0, _DZero = 0.0;
2364   PetscBLASInt           cm, cn, ck, alda, clda;
2365 
2366   PetscFunctionBegin;
2367   MatCheckProduct(C, 3);
2368   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2369   abt = (Mat_MatTransMultDense *)C->product->data;
2370   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2371   PetscCallMPI(MPI_Comm_size(comm, &size));
2372   PetscCall(MatDenseGetArrayRead(a->A, &av));
2373   PetscCall(MatDenseGetArrayRead(b->A, &bv));
2374   PetscCall(MatDenseGetArrayWrite(c->A, &cv));
2375   PetscCall(MatDenseGetLDA(a->A, &i));
2376   PetscCall(PetscBLASIntCast(i, &alda));
2377   PetscCall(MatDenseGetLDA(b->A, &blda));
2378   PetscCall(MatDenseGetLDA(c->A, &i));
2379   PetscCall(PetscBLASIntCast(i, &clda));
2380   /* copy transpose of B into buf[0] */
2381   bn      = B->rmap->n;
2382   sendbuf = abt->buf[0];
2383   recvbuf = abt->buf[1];
2384   for (k = 0, j = 0; j < bn; j++) {
2385     for (i = 0; i < cK; i++, k++) sendbuf[k] = bv[i * blda + j];
2386   }
2387   PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
2388   PetscCall(PetscMPIIntCast(bn * cK, &ibn));
2389   PetscCallMPI(MPI_Allgatherv(sendbuf, ibn, MPIU_SCALAR, recvbuf, abt->recvcounts, abt->recvdispls, MPIU_SCALAR, comm));
2390   PetscCall(PetscBLASIntCast(cK, &ck));
2391   PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
2392   PetscCall(PetscBLASIntCast(c->A->cmap->n, &cn));
2393   if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &cm, &cn, &ck, &_DOne, av, &alda, recvbuf, &ck, &_DZero, cv, &clda));
2394   PetscCall(MatDenseRestoreArrayRead(a->A, &av));
2396   PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
2397   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2398   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2399   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2400   PetscFunctionReturn(PETSC_SUCCESS);
2401 }
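/*
   After the MPI_Allgatherv() above, recvbuf holds the K x N matrix
   [B_0^T | B_1^T | ... | B_{p-1}^T] (K = A->cmap->N) in column-major order, so a
   single GEMM yields the locally owned rows of C = A * B^T. A hypothetical
   stand-alone driver exercising this kernel (sizes chosen arbitrarily):

     #include <petscmat.h>

     int main(int argc, char **argv)
     {
       Mat A, B, C;

       PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
       PetscCall(MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 20, 8, NULL, &A));
       PetscCall(MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 30, 8, NULL, &B));
       PetscCall(MatSetRandom(A, NULL));
       PetscCall(MatSetRandom(B, NULL));
       PetscCall(MatMatTransposeMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // C is 20 x 30
       PetscCall(MatDestroy(&C));
       PetscCall(MatDestroy(&B));
       PetscCall(MatDestroy(&A));
       PetscCall(PetscFinalize());
       return 0;
     }

   Run with -matmattransmult_mpidense_mpidense_via cyclic (or allgatherv, the
   default) to pick the numeric kernel dispatched by the routine below.
*/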
2402 
2403 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
2404 {
2405   Mat_MatTransMultDense *abt;
2406 
2407   PetscFunctionBegin;
2408   MatCheckProduct(C, 3);
2409   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2410   abt = (Mat_MatTransMultDense *)C->product->data;
2411   switch (abt->alg) {
2412   case 1: /* "cyclic" */
2413     PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(A, B, C));
2414     break;
2415   default: /* "allgatherv" */
2416     PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(A, B, C));
2417     break;
2418   }
2419   PetscFunctionReturn(PETSC_SUCCESS);
2420 }
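/*
   A hypothetical sketch of selecting the ABt algorithm programmatically through
   the MatProduct API rather than the command line (A and B are existing
   MPIDENSE matrices):

     Mat C;

     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_ABt));
     PetscCall(MatProductSetAlgorithm(C, "cyclic")); // or "allgatherv", the default
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/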
2421 
2422 static PetscErrorCode MatDestroy_MatMatMult_MPIDense_MPIDense(void *data)
2423 {
2424   Mat_MatMultDense *ab = (Mat_MatMultDense *)data;
2425 
2426   PetscFunctionBegin;
2427   PetscCall(MatDestroy(&ab->Ce));
2428   PetscCall(MatDestroy(&ab->Ae));
2429   PetscCall(MatDestroy(&ab->Be));
2430   PetscCall(PetscFree(ab));
2431   PetscFunctionReturn(PETSC_SUCCESS);
2432 }
2433 
2434 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
2435 {
2436   Mat_MatMultDense *ab;
2437   Mat_MPIDense     *mdn = (Mat_MPIDense *)A->data;
2438   Mat_MPIDense     *b   = (Mat_MPIDense *)B->data;
2439 
2440   PetscFunctionBegin;
2441   MatCheckProduct(C, 3);
2442   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Missing product data");
2443   ab = (Mat_MatMultDense *)C->product->data;
2444   if (ab->Ae && ab->Ce) {
2445 #if PetscDefined(HAVE_ELEMENTAL)
2446     PetscCall(MatConvert_MPIDense_Elemental(A, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Ae));
2447     PetscCall(MatConvert_MPIDense_Elemental(B, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Be));
2448     PetscCall(MatMatMultNumeric_Elemental(ab->Ae, ab->Be, ab->Ce));
2449     PetscCall(MatConvert(ab->Ce, MATMPIDENSE, MAT_REUSE_MATRIX, &C));
2450 #else
2451     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
2452 #endif
2453   } else {
2454     MPI_Comm           comm;
2455     const PetscScalar *read;
2456     PetscScalar       *write;
2457     PetscInt           lda;
2458     const PetscInt    *ranges;
2459     PetscMPIInt        size;
2460 
2461     if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A)); /* cannot be done during the symbolic phase because of possible calls to MatProductReplaceMats() */
2462     comm = PetscObjectComm((PetscObject)B);
2463     PetscCallMPI(MPI_Comm_size(comm, &size));
2464     PetscCall(PetscLayoutGetRanges(B->rmap, &ranges));
2465     if (ranges[1] == ranges[size]) {
2466       // optimized path: rank 0 owns every row of B, so the whole matrix can be broadcast from rank 0 in one shot
2467       PetscInt           b_lda, be_lda;
2468       Mat                b_local  = b->A;
2469       Mat                b_alloc  = NULL;
2470       Mat                be_local = ab->Be;
2471       Mat                be_alloc = NULL;
2472       PetscMemType       b_memtype, be_memtype;
2473       const PetscScalar *b_array = NULL;
2474       MPI_Datatype       vector_type;
2475       PetscScalar       *be_array = NULL;
2476       PetscMPIInt        rank;
2477 
2478       PetscCallMPI(MPI_Comm_rank(comm, &rank));
2479       PetscCall(MatDenseGetLDA(be_local, &be_lda));
2480       if (be_lda != B->rmap->N) { /* the broadcast needs fully contiguous storage; build a compact duplicate when Be is padded */
2481         PetscCall(MatDuplicate(be_local, MAT_DO_NOT_COPY_VALUES, &be_alloc));
2482         be_local = be_alloc;
2483       }
2484 
2485       if (rank == 0) {
2486         PetscCall(MatDenseGetLDA(b_local, &b_lda));
2487         if (b_lda != B->rmap->N) {
2488           PetscCall(MatDuplicate(b_local, MAT_DO_NOT_COPY_VALUES, &b_alloc));
2489           PetscCall(MatCopy(b_local, b_alloc, DIFFERENT_NONZERO_PATTERN));
2490           b_local = b_alloc;
2491         }
2492       }
2493       vector_type = MPIU_SCALAR;
2494       if (B->cmap->N > 1) { /* batch all columns into one MPI datatype so the SF moves the whole contiguous array in a single broadcast */
2495         PetscMPIInt mpi_N;
2496 
2497         PetscCall(PetscMPIIntCast(B->cmap->N, &mpi_N));
2498         PetscCallMPI(MPI_Type_contiguous(mpi_N, MPIU_SCALAR, &vector_type));
2499         PetscCallMPI(MPI_Type_commit(&vector_type));
2500       }
2501       PetscCall(MatDenseGetArrayReadAndMemType(b_local, &b_array, &b_memtype));
2502       PetscCall(MatDenseGetArrayWriteAndMemType(be_local, &be_array, &be_memtype));
2503       PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, vector_type, b_memtype, b_array, be_memtype, be_array, MPI_REPLACE));
2504       PetscCall(PetscSFBcastEnd(mdn->Mvctx, vector_type, b_array, be_array, MPI_REPLACE));
2505       PetscCall(MatDenseRestoreArrayWriteAndMemType(be_local, &be_array));
2506       PetscCall(MatDenseRestoreArrayReadAndMemType(b_local, &b_array));
2507       if (be_local != ab->Be) PetscCall(MatCopy(be_local, ab->Be, DIFFERENT_NONZERO_PATTERN));
2508       if (B->cmap->N > 1) PetscCallMPI(MPI_Type_free(&vector_type));
2509       PetscCall(MatDestroy(&be_alloc));
2510       PetscCall(MatDestroy(&b_alloc));
2511     } else {
2512       PetscCall(MatDenseGetLDA(B, &lda));
2513       PetscCall(MatDenseGetArrayRead(B, &read));
2514       PetscCall(MatDenseGetArrayWrite(ab->Be, &write));
2515       for (PetscInt i = 0; i < C->cmap->N; ++i) {
2516         PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
2517         PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
2518       }
2519       PetscCall(MatDenseRestoreArrayWrite(ab->Be, &write));
2520       PetscCall(MatDenseRestoreArrayRead(B, &read));
2521     }
2522     PetscCall(MatMatMultNumeric_SeqDense_SeqDense(((Mat_MPIDense *)A->data)->A, ab->Be, ((Mat_MPIDense *)C->data)->A));
2523   }
2524   PetscFunctionReturn(PETSC_SUCCESS);
2525 }
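/*
   Why the fast path above insists on lda == B->rmap->N at both ends: with fully
   contiguous column-major storage the rank-0 source and every destination have
   byte-identical layouts, so the row-indexed SF broadcast copies the array
   verbatim even though one `vector_type` unit need not coincide with one matrix
   row. For example, with 3 rows and 2 columns stored contiguously,

     memory: [b00 b10 b20 b01 b11 b21]
     units:  (b00 b10) (b20 b01) (b11 b21)

   the units straddle columns, but leaf row i maps to root row i, so every byte
   lands at the same offset it came from. With a padded lda the columns would be
   interleaved with uninitialized padding, hence the compact duplicates
   allocated above.
*/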
2526 
2527 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
2528 {
2529   Mat_Product      *product = C->product;
2530   PetscInt          alg;
2531   Mat_MatMultDense *ab;
2532   PetscBool         flg;
2533 
2534   PetscFunctionBegin;
2535   MatCheckProduct(C, 4);
2536   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
2537   /* check local size of A and B */
2538   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ", %" PetscInt_FMT ")",
2539              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
2540 
2541   PetscCall(PetscStrcmp(product->alg, "petsc", &flg));
2542   alg = flg ? 0 : 1;
2543 
2544   /* setup C */
2545   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
2546   PetscCall(MatSetType(C, MATMPIDENSE));
2547   PetscCall(MatSetUp(C));
2548 
2549   /* create the data structure holding the work matrices (Ae, Be, Ce) reused by the numeric phase */
2550   PetscCall(PetscNew(&ab));
2551 
2552   switch (alg) {
2553   case 1: /* alg: "elemental" */
2554 #if PetscDefined(HAVE_ELEMENTAL)
2555     /* create elemental matrices Ae and Be */
2556     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &ab->Ae));
2557     PetscCall(MatSetSizes(ab->Ae, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
2558     PetscCall(MatSetType(ab->Ae, MATELEMENTAL));
2559     PetscCall(MatSetUp(ab->Ae));
2560     PetscCall(MatSetOption(ab->Ae, MAT_ROW_ORIENTED, PETSC_FALSE));
2561 
2562     PetscCall(MatCreate(PetscObjectComm((PetscObject)B), &ab->Be));
2563     PetscCall(MatSetSizes(ab->Be, PETSC_DECIDE, PETSC_DECIDE, B->rmap->N, B->cmap->N));
2564     PetscCall(MatSetType(ab->Be, MATELEMENTAL));
2565     PetscCall(MatSetUp(ab->Be));
2566     PetscCall(MatSetOption(ab->Be, MAT_ROW_ORIENTED, PETSC_FALSE));
2567 
2568     /* compute symbolic Ce = Ae*Be */
2569     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &ab->Ce));
2570     PetscCall(MatMatMultSymbolic_Elemental(ab->Ae, ab->Be, fill, ab->Ce));
2571 #else
2572     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
2573 #endif
2574     break;
2575   default: /* alg: "petsc" */
2576     ab->Ae = NULL;
2577     PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, A->cmap->N, B->cmap->N, NULL, &ab->Be));
2578     ab->Ce = NULL;
2579     break;
2580   }
2581 
2582   C->product->data       = ab;
2583   C->product->destroy    = MatDestroy_MatMatMult_MPIDense_MPIDense;
2584   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIDense;
2585   PetscFunctionReturn(PETSC_SUCCESS);
2586 }
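/*
   The symbolic phase above allocates the work matrices once so that repeated
   numeric products reuse them. A hypothetical sketch of amortizing that setup
   over several multiplies (nsteps is a placeholder; A and B keep their layouts):

     Mat C = NULL;

     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric
     for (PetscInt it = 0; it < nsteps; it++) {
       // ... update the entries of A and/or B in place ...
       PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C)); // numeric phase only
     }
     PetscCall(MatDestroy(&C));
*/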
2587 
2588 static PetscErrorCode MatProductSetFromOptions_MPIDense_AB(Mat C)
2589 {
2590   Mat_Product *product     = C->product;
2591   const char  *algTypes[2] = {"petsc", "elemental"};
2592   PetscInt     alg, nalg = PetscDefined(HAVE_ELEMENTAL) ? 2 : 1;
2593   PetscBool    flg = PETSC_FALSE;
2594 
2595   PetscFunctionBegin;
2596   /* Set default algorithm */
2597   alg = 0; /* default is PETSc */
2598   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2599   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2600 
2601   /* Get runtime option */
2602   PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2603   PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[alg], &alg, &flg));
2604   PetscOptionsEnd();
2605   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2606 
2607   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIDense;
2608   C->ops->productsymbolic = MatProductSymbolic_AB;
2609   PetscFunctionReturn(PETSC_SUCCESS);
2610 }
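/*
   With the options above, the AB algorithm can be selected at runtime for a
   hypothetical executable ./app ("elemental" is only listed when PETSc is
   configured with Elemental):

     ./app -mat_product_algorithm petsc      (default: broadcast B, local GEMM)
     ./app -mat_product_algorithm elemental  (convert to MATELEMENTAL and multiply there)
*/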
2611 
2612 static PetscErrorCode MatProductSetFromOptions_MPIDense_AtB(Mat C)
2613 {
2614   Mat_Product *product = C->product;
2615   Mat          A = product->A, B = product->B;
2616 
2617   PetscFunctionBegin;
2618   PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ", %" PetscInt_FMT ")",
2619              A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
2620   C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIDense_MPIDense;
2621   C->ops->productsymbolic          = MatProductSymbolic_AtB;
2622   PetscFunctionReturn(PETSC_SUCCESS);
2623 }
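/*
   A hypothetical one-liner exercising this AtB path, where A and B are existing
   MPIDENSE matrices sharing the row layout checked above:

     Mat C;

     PetscCall(MatTransposeMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // C = A^T * B
*/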
2624 
2625 static PetscErrorCode MatProductSetFromOptions_MPIDense_ABt(Mat C)
2626 {
2627   Mat_Product *product     = C->product;
2628   const char  *algTypes[2] = {"allgatherv", "cyclic"};
2629   PetscInt     alg, nalg = 2;
2630   PetscBool    flg = PETSC_FALSE;
2631 
2632   PetscFunctionBegin;
2633   /* Set default algorithm */
2634   alg = 0; /* default is allgatherv */
2635   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2636   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2637 
2638   /* Get runtime option */
2639   if (product->api_user) {
2640     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2641     PetscCall(PetscOptionsEList("-matmattransmult_mpidense_mpidense_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2642     PetscOptionsEnd();
2643   } else {
2644     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2645     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2646     PetscOptionsEnd();
2647   }
2648   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2649 
2650   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_MPIDense_MPIDense;
2651   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2652   PetscFunctionReturn(PETSC_SUCCESS);
2653 }
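/*
   Note the two option spellings handled above: direct MatMatTransposeMult()
   callers (product->api_user) use the legacy name, while the MatProduct API uses
   the generic one. Both accept the same values, e.g. for a hypothetical ./app:

     ./app -matmattransmult_mpidense_mpidense_via cyclic
     ./app -mat_product_algorithm cyclic
*/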
2654 
2655 static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat C)
2656 {
2657   Mat_Product *product = C->product;
2658 
2659   PetscFunctionBegin;
2660   switch (product->type) {
2661   case MATPRODUCT_AB:
2662     PetscCall(MatProductSetFromOptions_MPIDense_AB(C));
2663     break;
2664   case MATPRODUCT_AtB:
2665     PetscCall(MatProductSetFromOptions_MPIDense_AtB(C));
2666     break;
2667   case MATPRODUCT_ABt:
2668     PetscCall(MatProductSetFromOptions_MPIDense_ABt(C));
2669     break;
2670   default:
2671     break;
2672   }
2673   PetscFunctionReturn(PETSC_SUCCESS);
2674 }
2675