xref: /petsc/src/mat/impls/dense/mpi/mpidense.c (revision 4dbf25a8fa98e38799e7b47dcb2d8a9309975f41)
1 /*
2    Basic functions for basic parallel dense matrices.
3    Portions of this code are under:
4    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
5 */
6 
7 #include <../src/mat/impls/dense/mpi/mpidense.h> /*I   "petscmat.h"  I*/
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscblaslapack.h>
10 #include <petsc/private/vecimpl.h>
11 #include <petscdevice.h>
12 #include <petsc/private/deviceimpl.h>
13 
14 /*@
15   MatDenseGetLocalMatrix - For a `MATMPIDENSE` or `MATSEQDENSE` matrix returns the sequential
16   matrix that represents the operator. For sequential matrices it returns itself.
17 
18   Input Parameter:
19 . A - the sequential or MPI `MATDENSE` matrix
20 
21   Output Parameter:
22 . B - the inner matrix
23 
24   Level: intermediate
25 
26 .seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MATMPIDENSE`, `MATSEQDENSE`
27 @*/
28 PetscErrorCode MatDenseGetLocalMatrix(Mat A, Mat *B)
29 {
30   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
31   PetscBool     flg;
32 
33   PetscFunctionBegin;
34   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
35   PetscAssertPointer(B, 2);
36   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIDENSE, &flg));
37   if (flg) *B = mat->A;
38   else {
39     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQDENSE, &flg));
40     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for matrix type %s", ((PetscObject)A)->type_name);
41     *B = A;
42   }
43   PetscFunctionReturn(PETSC_SUCCESS);
44 }
45 
46 static PetscErrorCode MatCopy_MPIDense(Mat A, Mat B, MatStructure s)
47 {
48   Mat_MPIDense *Amat = (Mat_MPIDense *)A->data;
49   Mat_MPIDense *Bmat = (Mat_MPIDense *)B->data;
50 
51   PetscFunctionBegin;
52   PetscCall(MatCopy(Amat->A, Bmat->A, s));
53   PetscFunctionReturn(PETSC_SUCCESS);
54 }
55 
56 PetscErrorCode MatShift_MPIDense(Mat A, PetscScalar alpha)
57 {
58   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
59   PetscInt      j, lda, rstart = A->rmap->rstart, rend = A->rmap->rend, rend2;
60   PetscScalar  *v;
61 
62   PetscFunctionBegin;
63   PetscCall(MatDenseGetArray(mat->A, &v));
64   PetscCall(MatDenseGetLDA(mat->A, &lda));
65   rend2 = PetscMin(rend, A->cmap->N);
66   if (rend2 > rstart) {
67     for (j = rstart; j < rend2; j++) v[j - rstart + j * lda] += alpha;
68     PetscCall(PetscLogFlops(rend2 - rstart));
69   }
70   PetscCall(MatDenseRestoreArray(mat->A, &v));
71   PetscFunctionReturn(PETSC_SUCCESS);
72 }
73 
74 static PetscErrorCode MatGetRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
75 {
76   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
77   PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;
78 
79   PetscFunctionBegin;
80   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
81   lrow = row - rstart;
82   PetscCall(MatGetRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
83   PetscFunctionReturn(PETSC_SUCCESS);
84 }
85 
86 static PetscErrorCode MatRestoreRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
87 {
88   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
89   PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;
90 
91   PetscFunctionBegin;
92   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
93   lrow = row - rstart;
94   PetscCall(MatRestoreRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
static PetscErrorCode MatGetDiagonalBlock_MPIDense(Mat A, Mat *a)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)A->data;
  PetscInt      m = A->rmap->n, rstart = A->rmap->rstart;
  PetscScalar  *array;
  MPI_Comm      comm;
  PetscBool     flg;
  Mat           B;

  PetscFunctionBegin;
  /* Return this rank's square diagonal block (rows [rstart, rstart+m) by the matching
     m global columns) as a sequential dense matrix cached on A under "DiagonalBlock". */
  PetscCall(MatHasCongruentLayouts(A, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only square matrices supported.");
  PetscCall(PetscObjectQuery((PetscObject)A, "DiagonalBlock", (PetscObject *)&B));
  if (!B) { /* This should use MatDenseGetSubMatrix (not create), but we would need a call like MatRestoreDiagonalBlock */
#if PetscDefined(HAVE_CUDA)
    PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSECUDA, &flg));
    PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSECUDA);
#elif PetscDefined(HAVE_HIP)
    PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSEHIP, &flg));
    PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSEHIP);
#endif
    PetscCall(PetscObjectGetComm((PetscObject)mdn->A, &comm));
    PetscCall(MatCreate(comm, &B));
    PetscCall(MatSetSizes(B, m, m, m, m));
    PetscCall(MatSetType(B, ((PetscObject)mdn->A)->type_name));
    /* B aliases the local array starting at column rstart; no data is copied.
       NOTE(review): the offset m*rstart assumes the local leading dimension equals m —
       confirm for matrices whose lda was changed via MatDenseSetLDA(). */
    PetscCall(MatDenseGetArrayRead(mdn->A, (const PetscScalar **)&array));
    PetscCall(MatSeqDenseSetPreallocation(B, array + m * rstart));
    PetscCall(MatDenseRestoreArrayRead(mdn->A, (const PetscScalar **)&array));
    /* Compose on A (takes a reference), then drop the local reference */
    PetscCall(PetscObjectCompose((PetscObject)A, "DiagonalBlock", (PetscObject)B));
    *a = B;
    PetscCall(MatDestroy(&B));
  } else *a = B;
  PetscFunctionReturn(PETSC_SUCCESS);
}
132 
static PetscErrorCode MatSetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPIDense *A = (Mat_MPIDense *)mat->data;
  PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;
  PetscBool     roworiented = A->roworiented;

  PetscFunctionBegin;
  /* Insert/add values: locally owned rows go straight into the sequential block;
     off-process rows are stashed and exchanged during assembly. */
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue; /* negative row indices are ignored by convention */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
    if (idxm[i] >= rstart && idxm[i] < rend) {
      row = idxm[i] - rstart;
      if (roworiented) {
        /* v is row-major: row i of the input occupies v[i*n .. i*n+n-1] */
        PetscCall(MatSetValues(A->A, 1, &row, n, idxn, PetscSafePointerPlusOffset(v, i * n), addv));
      } else {
        /* v is column-major: entry (i,j) is v[i + j*m]; insert one value at a time */
        for (j = 0; j < n; j++) {
          if (idxn[j] < 0) continue; /* negative column indices are ignored */
          PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
          PetscCall(MatSetValues(A->A, 1, &row, 1, &idxn[j], PetscSafePointerPlusOffset(v, i + j * m), addv));
        }
      }
    } else if (!A->donotstash) {
      /* Off-process row: mark unassembled and stash for MatAssemblyBegin/End */
      mat->assembled = PETSC_FALSE;
      if (roworiented) {
        PetscCall(MatStashValuesRow_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i * n), PETSC_FALSE));
      } else {
        PetscCall(MatStashValuesCol_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i), m, PETSC_FALSE));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
165 
166 static PetscErrorCode MatGetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
167 {
168   Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
169   PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;
170 
171   PetscFunctionBegin;
172   for (i = 0; i < m; i++) {
173     if (idxm[i] < 0) continue; /* negative row */
174     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
175     if (idxm[i] >= rstart && idxm[i] < rend) {
176       row = idxm[i] - rstart;
177       for (j = 0; j < n; j++) {
178         if (idxn[j] < 0) continue; /* negative column */
179         PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
180         PetscCall(MatGetValues(mdn->A, 1, &row, 1, &idxn[j], v + i * n + j));
181       }
182     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported");
183   }
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode MatDenseGetLDA_MPIDense(Mat A, PetscInt *lda)
188 {
189   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
190 
191   PetscFunctionBegin;
192   PetscCall(MatDenseGetLDA(a->A, lda));
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
static PetscErrorCode MatDenseSetLDA_MPIDense(Mat A, PetscInt lda)
{
  Mat_MPIDense *a     = (Mat_MPIDense *)A->data;
  MatType       mtype = MATSEQDENSE;

  PetscFunctionBegin;
  /* Forward the leading-dimension request to the sequential block, first creating
     the block (with a type matching A's backend) if it does not exist yet. */
  if (!a->A) {
    PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
    PetscCall(PetscLayoutSetUp(A->rmap));
    PetscCall(PetscLayoutSetUp(A->cmap));
    PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
    /* the local block holds all N global columns for the locally owned rows */
    PetscCall(MatSetSizes(a->A, A->rmap->n, A->cmap->N, A->rmap->n, A->cmap->N));
#if PetscDefined(HAVE_CUDA)
    PetscBool iscuda;
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSECUDA, &iscuda));
    if (iscuda) mtype = MATSEQDENSECUDA;
#elif PetscDefined(HAVE_HIP)
    PetscBool iship;
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSEHIP, &iship));
    if (iship) mtype = MATSEQDENSEHIP;
#endif
    PetscCall(MatSetType(a->A, mtype));
  }
  PetscCall(MatDenseSetLDA(a->A, lda));
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
223 static PetscErrorCode MatDenseGetArray_MPIDense(Mat A, PetscScalar **array)
224 {
225   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
226 
227   PetscFunctionBegin;
228   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
229   PetscCall(MatDenseGetArray(a->A, array));
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 static PetscErrorCode MatDenseGetArrayRead_MPIDense(Mat A, PetscScalar **array)
234 {
235   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
236 
237   PetscFunctionBegin;
238   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
239   PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)array));
240   PetscFunctionReturn(PETSC_SUCCESS);
241 }
242 
243 static PetscErrorCode MatDenseGetArrayWrite_MPIDense(Mat A, PetscScalar **array)
244 {
245   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
246 
247   PetscFunctionBegin;
248   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
249   PetscCall(MatDenseGetArrayWrite(a->A, array));
250   PetscFunctionReturn(PETSC_SUCCESS);
251 }
252 
253 static PetscErrorCode MatDensePlaceArray_MPIDense(Mat A, const PetscScalar *array)
254 {
255   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
256 
257   PetscFunctionBegin;
258   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
259   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
260   PetscCall(MatDensePlaceArray(a->A, array));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 static PetscErrorCode MatDenseResetArray_MPIDense(Mat A)
265 {
266   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
267 
268   PetscFunctionBegin;
269   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
270   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
271   PetscCall(MatDenseResetArray(a->A));
272   PetscFunctionReturn(PETSC_SUCCESS);
273 }
274 
275 static PetscErrorCode MatDenseReplaceArray_MPIDense(Mat A, const PetscScalar *array)
276 {
277   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
278 
279   PetscFunctionBegin;
280   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
281   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
282   PetscCall(MatDenseReplaceArray(a->A, array));
283   PetscFunctionReturn(PETSC_SUCCESS);
284 }
285 
286 static PetscErrorCode MatCreateSubMatrix_MPIDense(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
287 {
288   Mat_MPIDense      *mat = (Mat_MPIDense *)A->data, *newmatd;
289   PetscInt           lda, i, j, rstart, rend, nrows, ncols, Ncols, nlrows, nlcols;
290   const PetscInt    *irow, *icol;
291   const PetscScalar *v;
292   PetscScalar       *bv;
293   Mat                newmat;
294   IS                 iscol_local;
295   MPI_Comm           comm_is, comm_mat;
296 
297   PetscFunctionBegin;
298   PetscCall(PetscObjectGetComm((PetscObject)A, &comm_mat));
299   PetscCall(PetscObjectGetComm((PetscObject)iscol, &comm_is));
300   PetscCheck(comm_mat == comm_is, PETSC_COMM_SELF, PETSC_ERR_ARG_NOTSAMECOMM, "IS communicator must match matrix communicator");
301 
302   PetscCall(ISAllGather(iscol, &iscol_local));
303   PetscCall(ISGetIndices(isrow, &irow));
304   PetscCall(ISGetIndices(iscol_local, &icol));
305   PetscCall(ISGetLocalSize(isrow, &nrows));
306   PetscCall(ISGetLocalSize(iscol, &ncols));
307   PetscCall(ISGetSize(iscol, &Ncols)); /* global number of columns, size of iscol_local */
308 
309   /* No parallel redistribution currently supported! Should really check each index set
310      to confirm that it is OK.  ... Currently supports only submatrix same partitioning as
311      original matrix! */
312 
313   PetscCall(MatGetLocalSize(A, &nlrows, &nlcols));
314   PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
315 
316   /* Check submatrix call */
317   if (scall == MAT_REUSE_MATRIX) {
318     /* SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Reused submatrix wrong size"); */
319     /* Really need to test rows and column sizes! */
320     newmat = *B;
321   } else {
322     /* Create and fill new matrix */
323     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &newmat));
324     PetscCall(MatSetSizes(newmat, nrows, ncols, PETSC_DECIDE, Ncols));
325     PetscCall(MatSetType(newmat, ((PetscObject)A)->type_name));
326     PetscCall(MatMPIDenseSetPreallocation(newmat, NULL));
327   }
328 
329   /* Now extract the data pointers and do the copy, column at a time */
330   newmatd = (Mat_MPIDense *)newmat->data;
331   PetscCall(MatDenseGetArray(newmatd->A, &bv));
332   PetscCall(MatDenseGetArrayRead(mat->A, &v));
333   PetscCall(MatDenseGetLDA(mat->A, &lda));
334   for (i = 0; i < Ncols; i++) {
335     const PetscScalar *av = v + lda * icol[i];
336     for (j = 0; j < nrows; j++) *bv++ = av[irow[j] - rstart];
337   }
338   PetscCall(MatDenseRestoreArrayRead(mat->A, &v));
339   PetscCall(MatDenseRestoreArray(newmatd->A, &bv));
340 
341   /* Assemble the matrices so that the correct flags are set */
342   PetscCall(MatAssemblyBegin(newmat, MAT_FINAL_ASSEMBLY));
343   PetscCall(MatAssemblyEnd(newmat, MAT_FINAL_ASSEMBLY));
344 
345   /* Free work space */
346   PetscCall(ISRestoreIndices(isrow, &irow));
347   PetscCall(ISRestoreIndices(iscol_local, &icol));
348   PetscCall(ISDestroy(&iscol_local));
349   *B = newmat;
350   PetscFunctionReturn(PETSC_SUCCESS);
351 }
352 
353 static PetscErrorCode MatDenseRestoreArray_MPIDense(Mat A, PetscScalar **array)
354 {
355   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
356 
357   PetscFunctionBegin;
358   PetscCall(MatDenseRestoreArray(a->A, array));
359   PetscFunctionReturn(PETSC_SUCCESS);
360 }
361 
362 static PetscErrorCode MatDenseRestoreArrayRead_MPIDense(Mat A, PetscScalar **array)
363 {
364   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
365 
366   PetscFunctionBegin;
367   PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)array));
368   PetscFunctionReturn(PETSC_SUCCESS);
369 }
370 
371 static PetscErrorCode MatDenseRestoreArrayWrite_MPIDense(Mat A, PetscScalar **array)
372 {
373   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
374 
375   PetscFunctionBegin;
376   PetscCall(MatDenseRestoreArrayWrite(a->A, array));
377   PetscFunctionReturn(PETSC_SUCCESS);
378 }
379 
static PetscErrorCode MatAssemblyBegin_MPIDense(Mat mat, MatAssemblyType mode)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
  PetscInt      nstash, reallocs;

  PetscFunctionBegin;
  /* Nothing to communicate when stashing is disabled or no off-process entries were set */
  if (mdn->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);

  /* Start sending stashed (off-process) values to their owning ranks; completed in MatAssemblyEnd */
  PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
  PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
  PetscCall(PetscInfo(mdn->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
  PetscFunctionReturn(PETSC_SUCCESS);
}
393 
static PetscErrorCode MatAssemblyEnd_MPIDense(Mat mat, MatAssemblyType mode)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
  PetscInt      i, *row, *col, flg, j, rstart, ncols;
  PetscMPIInt   n;
  PetscScalar  *val;

  PetscFunctionBegin;
  /* Drain the stash: receive the off-process values sent in MatAssemblyBegin,
     apply them locally, then assemble the sequential block. */
  if (!mdn->donotstash && !mat->nooffprocentries) {
    /*  wait on receives */
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break; /* no more incoming messages */

      for (i = 0; i < n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j = i, rstart = row[j]; j < n; j++) {
          if (row[j] != rstart) break;
        }
        if (j < n) ncols = j - i;
        else ncols = n - i;
        /* Now assemble all these values with a single function call */
        PetscCall(MatSetValues_MPIDense(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
        i = j;
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }

  PetscCall(MatAssemblyBegin(mdn->A, mode));
  PetscCall(MatAssemblyEnd(mdn->A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
427 
428 static PetscErrorCode MatZeroEntries_MPIDense(Mat A)
429 {
430   Mat_MPIDense *l = (Mat_MPIDense *)A->data;
431 
432   PetscFunctionBegin;
433   PetscCall(MatZeroEntries(l->A));
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
static PetscErrorCode MatZeroRows_MPIDense(Mat A, PetscInt n, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIDense *l = (Mat_MPIDense *)A->data;
  PetscInt      i, len, *lrows;

  PetscFunctionBegin;
  /* get locally owned rows */
  PetscCall(PetscLayoutMapLocal(A->rmap, n, rows, &len, &lrows, NULL));
  /* fix right-hand side if needed: b[r] = diag * x[r] for each zeroed local row r */
  if (x && b) {
    const PetscScalar *xx;
    PetscScalar       *bb;

    PetscCall(VecGetArrayRead(x, &xx));
    PetscCall(VecGetArrayWrite(b, &bb));
    for (i = 0; i < len; ++i) bb[lrows[i]] = diag * xx[lrows[i]];
    PetscCall(VecRestoreArrayRead(x, &xx));
    PetscCall(VecRestoreArrayWrite(b, &bb));
  }
  /* zero the rows with no diagonal entry, then place diag afterwards if requested */
  PetscCall(MatZeroRows(l->A, len, lrows, 0.0, NULL, NULL));
  if (diag != 0.0) {
    Vec d;

    PetscCall(MatCreateVecs(A, NULL, &d));
    PetscCall(VecSet(d, diag));
    /* NOTE(review): MatDiagonalSet with a constant vector writes diag into EVERY
       diagonal entry, not only the zeroed rows — confirm this is the intended semantics */
    PetscCall(MatDiagonalSet(A, d, INSERT_VALUES));
    PetscCall(VecDestroy(&d));
  }
  PetscCall(PetscFree(lrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
468 
469 PETSC_INTERN PetscErrorCode MatMult_SeqDense(Mat, Vec, Vec);
470 PETSC_INTERN PetscErrorCode MatMultAdd_SeqDense(Mat, Vec, Vec, Vec);
471 PETSC_INTERN PetscErrorCode MatMultTranspose_SeqDense(Mat, Vec, Vec);
472 PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqDense(Mat, Vec, Vec, Vec);
473 
static PetscErrorCode MatMultColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* y = A[:, c_start:c_end) * x: broadcast the needed entries of x into the local
     work vector lvec via the SF, then apply the sequential column-range kernel. */
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscUseMethod(mdn->A, "MatMultColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}
492 
static PetscErrorCode MatMult_MPIDense(Mat mat, Vec xx, Vec yy)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* y = A*x: broadcast all entries of x into the local work vector lvec via the SF,
     then multiply with the sequential block. */
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscCall((*mdn->A->ops->mult)(mdn->A, mdn->lvec, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
511 
static PetscErrorCode MatMultAddColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* z = y + A[:, c_start:c_end) * x: gather x into lvec via the SF, then apply
     the sequential column-range multiply-add kernel. */
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscUseMethod(mdn->A, "MatMultAddColumnRange_C", (Mat, Vec, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, zz, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}
530 
static PetscErrorCode MatMultAdd_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* z = y + A*x: gather x into the local work vector lvec via the SF, then apply
     the sequential multiply-add kernel. */
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscCall((*mdn->A->ops->multadd)(mdn->A, mdn->lvec, yy, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
549 
static PetscErrorCode MatMultHermitianTransposeColumnRange_MPIDense(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;
  PetscInt           r_start, r_end;
  PetscInt           c_start_local, c_end_local;

  PetscFunctionBegin;
  /* y[c_start:c_end) = A[:, c_start:c_end)^H * x: compute partial results into the
     local work vector lvec, then sum-reduce them onto the owning ranks of y. */
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecZeroEntries(a->lvec)); /* entries outside the column range must contribute zero */
  PetscCall(VecGetOwnershipRange(yy, &r_start, &r_end));
  /* clip the requested column range to this rank's owned portion of y */
  c_start_local = PetscMax(c_start, r_start);
  c_end_local   = PetscMin(c_end, r_end);
  PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
  if (c_end_local > c_start_local) {
    /* zero the destination slice so the SUM reduction below yields the product, not an accumulation */
    if (PetscMemTypeHost(aymtype)) {
      PetscCall(PetscArrayzero(&ay[c_start_local], (size_t)(c_end_local - c_start_local)));
    } else {
      PetscCall(PetscDeviceRegisterMemory(ay, aymtype, sizeof(*ay) * ((size_t)(r_end - r_start))));
      PetscCall(PetscDeviceArrayZero(NULL, &ay[c_start_local], (size_t)(c_end_local - c_start_local)));
    }
  }
  PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(yy, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}
582 
static PetscErrorCode MatMultTransposeKernel_MPIDense(Mat A, Vec xx, Vec yy, PetscBool herm)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* y = A^T x (or A^H x when herm): apply the local (hermitian-)transpose kernel
     into lvec, then sum-reduce the partial results onto the owning ranks of y. */
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecSet(yy, 0.0)); /* the reduction adds into y, so start from zero */
  if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
  else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(yy, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}
603 
static PetscErrorCode MatMultHermitianTransposeAddColumnRange_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* z = y + A[:, c_start:c_end)^H * x: seed z with y, compute partial results into
     lvec, then sum-reduce them into z on the owning ranks. */
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecCopy(yy, zz));
  PetscCall(VecZeroEntries(a->lvec)); /* entries outside the column range must contribute zero */
  PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(zz, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}
624 
static PetscErrorCode MatMultTransposeAddKernel_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscBool herm)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  /* z = y + A^T x (or y + A^H x when herm): seed z with y, apply the local
     (hermitian-)transpose kernel into lvec, then sum-reduce into z. */
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecCopy(yy, zz));
  if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
  else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
  /* memtype-aware begin: the arrays may live on host or device */
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(zz, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}
645 
static PetscErrorCode MatMultTranspose_MPIDense(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* y = A^T x (no conjugation) */
  PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
652 
static PetscErrorCode MatMultTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* z = y + A^T x (no conjugation) */
  PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
659 
static PetscErrorCode MatMultHermitianTranspose_MPIDense(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* y = A^H x (conjugate transpose) */
  PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
666 
static PetscErrorCode MatMultHermitianTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* z = y + A^H x (conjugate transpose) */
  PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
673 
674 PetscErrorCode MatGetDiagonal_MPIDense(Mat A, Vec v)
675 {
676   Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
677   PetscInt           lda, len, i, nl, ng, m = A->rmap->n, radd;
678   PetscScalar       *x;
679   const PetscScalar *av;
680 
681   PetscFunctionBegin;
682   PetscCall(VecGetArray(v, &x));
683   PetscCall(VecGetSize(v, &ng));
684   PetscCheck(ng == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming mat and vec");
685   PetscCall(VecGetLocalSize(v, &nl));
686   len  = PetscMin(a->A->rmap->n, a->A->cmap->n);
687   radd = A->rmap->rstart * m;
688   PetscCall(MatDenseGetArrayRead(a->A, &av));
689   PetscCall(MatDenseGetLDA(a->A, &lda));
690   for (i = 0; i < len; i++) x[i] = av[radd + i * lda + i];
691   PetscCall(MatDenseRestoreArrayRead(a->A, &av));
692   if (nl - i > 0) PetscCall(PetscArrayzero(x + i, nl - i));
693   PetscCall(VecRestoreArray(v, &x));
694   PetscFunctionReturn(PETSC_SUCCESS);
695 }
696 
/* Destructor for MATMPIDENSE: frees the inner sequential matrix and all cached
   work objects, then unregisters every composed method so the object can be
   safely retyped or freed. */
static PetscErrorCode MatDestroy_MPIDense(Mat mat)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;

  PetscFunctionBegin;
  PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
  PetscCall(MatStashDestroy_Private(&mat->stash));
  /* Refuse to destroy while a borrowed column vector or submatrix is still outstanding */
  PetscCheck(!mdn->vecinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mdn->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  /* Free the local (sequential) matrix and the communication/work objects */
  PetscCall(MatDestroy(&mdn->A));
  PetscCall(VecDestroy(&mdn->lvec));
  PetscCall(PetscSFDestroy(&mdn->Mvctx));
  PetscCall(VecDestroy(&mdn->cvec));
  PetscCall(MatDestroy(&mdn->cmat));

  PetscCall(PetscFree(mat->data));
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));

  /* Unregister (compose to NULL) every method attached by MatCreate_MPIDense and friends */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", NULL));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", NULL));
#endif
#if defined(PETSC_HAVE_SCALAPACK)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", NULL));
#if defined(PETSC_HAVE_CUDA)
  /* CUDA-specific conversions, products, and device array accessors */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensecuda_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAPlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDASetPreallocation_C", NULL));
#endif
#if defined(PETSC_HAVE_HIP)
  /* HIP-specific conversions, products, and device array accessors */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensehip_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPPlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPSetPreallocation_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", NULL));

  /* Drop the reference to the cached diagonal-block object, if any */
  PetscCall(PetscObjectCompose((PetscObject)mat, "DiagonalBlock", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
795 
796 #include <petscdraw.h>
797 static PetscErrorCode MatView_MPIDense_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
798 {
799   Mat_MPIDense     *mdn = (Mat_MPIDense *)mat->data;
800   PetscMPIInt       rank;
801   PetscViewerType   vtype;
802   PetscBool         iascii, isdraw;
803   PetscViewer       sviewer;
804   PetscViewerFormat format;
805 
806   PetscFunctionBegin;
807   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
808   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
809   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
810   if (iascii) {
811     PetscCall(PetscViewerGetType(viewer, &vtype));
812     PetscCall(PetscViewerGetFormat(viewer, &format));
813     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
814       MatInfo info;
815       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
816       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
817       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "  [%d] local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %" PetscInt_FMT " \n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
818                                                    (PetscInt)info.memory));
819       PetscCall(PetscViewerFlush(viewer));
820       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
821       if (mdn->Mvctx) PetscCall(PetscSFView(mdn->Mvctx, viewer));
822       PetscFunctionReturn(PETSC_SUCCESS);
823     } else if (format == PETSC_VIEWER_ASCII_INFO) {
824       PetscFunctionReturn(PETSC_SUCCESS);
825     }
826   } else if (isdraw) {
827     PetscDraw draw;
828     PetscBool isnull;
829 
830     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
831     PetscCall(PetscDrawIsNull(draw, &isnull));
832     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
833   }
834 
835   {
836     /* assemble the entire matrix onto first processor. */
837     Mat          A;
838     PetscInt     M = mat->rmap->N, N = mat->cmap->N, m, row, i, nz;
839     PetscInt    *cols;
840     PetscScalar *vals;
841 
842     PetscCall(MatCreate(PetscObjectComm((PetscObject)mat), &A));
843     if (rank == 0) {
844       PetscCall(MatSetSizes(A, M, N, M, N));
845     } else {
846       PetscCall(MatSetSizes(A, 0, 0, M, N));
847     }
848     /* Since this is a temporary matrix, MATMPIDENSE instead of ((PetscObject)A)->type_name here is probably acceptable. */
849     PetscCall(MatSetType(A, MATMPIDENSE));
850     PetscCall(MatMPIDenseSetPreallocation(A, NULL));
851 
852     /* Copy the matrix ... This isn't the most efficient means,
853        but it's quick for now */
854     A->insertmode = INSERT_VALUES;
855 
856     row = mat->rmap->rstart;
857     m   = mdn->A->rmap->n;
858     for (i = 0; i < m; i++) {
859       PetscCall(MatGetRow_MPIDense(mat, row, &nz, &cols, &vals));
860       PetscCall(MatSetValues_MPIDense(A, 1, &row, nz, cols, vals, INSERT_VALUES));
861       PetscCall(MatRestoreRow_MPIDense(mat, row, &nz, &cols, &vals));
862       row++;
863     }
864 
865     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
866     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
867     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
868     if (rank == 0) {
869       PetscCall(PetscObjectSetName((PetscObject)((Mat_MPIDense *)A->data)->A, ((PetscObject)mat)->name));
870       PetscCall(MatView_SeqDense(((Mat_MPIDense *)A->data)->A, sviewer));
871     }
872     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
873     PetscCall(MatDestroy(&A));
874   }
875   PetscFunctionReturn(PETSC_SUCCESS);
876 }
877 
878 static PetscErrorCode MatView_MPIDense(Mat mat, PetscViewer viewer)
879 {
880   PetscBool iascii, isbinary, isdraw, issocket;
881 
882   PetscFunctionBegin;
883   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
884   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
885   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
886   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
887 
888   if (iascii || issocket || isdraw) {
889     PetscCall(MatView_MPIDense_ASCIIorDraworSocket(mat, viewer));
890   } else if (isbinary) PetscCall(MatView_Dense_Binary(mat, viewer));
891   PetscFunctionReturn(PETSC_SUCCESS);
892 }
893 
894 static PetscErrorCode MatGetInfo_MPIDense(Mat A, MatInfoType flag, MatInfo *info)
895 {
896   Mat_MPIDense  *mat = (Mat_MPIDense *)A->data;
897   Mat            mdn = mat->A;
898   PetscLogDouble isend[5], irecv[5];
899 
900   PetscFunctionBegin;
901   info->block_size = 1.0;
902 
903   PetscCall(MatGetInfo(mdn, MAT_LOCAL, info));
904 
905   isend[0] = info->nz_used;
906   isend[1] = info->nz_allocated;
907   isend[2] = info->nz_unneeded;
908   isend[3] = info->memory;
909   isend[4] = info->mallocs;
910   if (flag == MAT_LOCAL) {
911     info->nz_used      = isend[0];
912     info->nz_allocated = isend[1];
913     info->nz_unneeded  = isend[2];
914     info->memory       = isend[3];
915     info->mallocs      = isend[4];
916   } else if (flag == MAT_GLOBAL_MAX) {
917     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)A)));
918 
919     info->nz_used      = irecv[0];
920     info->nz_allocated = irecv[1];
921     info->nz_unneeded  = irecv[2];
922     info->memory       = irecv[3];
923     info->mallocs      = irecv[4];
924   } else if (flag == MAT_GLOBAL_SUM) {
925     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)A)));
926 
927     info->nz_used      = irecv[0];
928     info->nz_allocated = irecv[1];
929     info->nz_unneeded  = irecv[2];
930     info->memory       = irecv[3];
931     info->mallocs      = irecv[4];
932   }
933   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
934   info->fill_ratio_needed = 0;
935   info->factor_mallocs    = 0;
936   PetscFunctionReturn(PETSC_SUCCESS);
937 }
938 
/* Set an option on a MATMPIDENSE matrix; most options are forwarded to the
   inner sequential matrix, a few are recorded in the parallel wrapper, and
   unrecognized options are silently ignored (standard MatSetOption behavior). */
static PetscErrorCode MatSetOption_MPIDense(Mat A, MatOption op, PetscBool flg)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_NEW_NONZERO_LOCATIONS:
  case MAT_NEW_NONZERO_LOCATION_ERR:
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
    /* forwarded to the local matrix; requires preallocation so a->A exists */
    MatCheckPreallocated(A, 1);
    PetscCall(MatSetOption(a->A, op, flg));
    break;
  case MAT_ROW_ORIENTED:
    /* recorded locally (affects MatSetValues interpretation) and forwarded */
    MatCheckPreallocated(A, 1);
    a->roworiented = flg;
    PetscCall(MatSetOption(a->A, op, flg));
    break;
  case MAT_IGNORE_OFF_PROC_ENTRIES:
    /* purely a parallel-assembly option; nothing to forward */
    a->donotstash = flg;
    break;
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD:
  case MAT_SPD_ETERNAL:
    /* if the diagonal matrix is square it inherits some of the properties above */
    if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
    break;
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
974 
975 static PetscErrorCode MatDiagonalScale_MPIDense(Mat A, Vec ll, Vec rr)
976 {
977   Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
978   const PetscScalar *l;
979   PetscScalar        x, *v, *vv, *r;
980   PetscInt           i, j, s2a, s3a, s2, s3, m = mdn->A->rmap->n, n = mdn->A->cmap->n, lda;
981 
982   PetscFunctionBegin;
983   PetscCall(MatDenseGetArray(mdn->A, &vv));
984   PetscCall(MatDenseGetLDA(mdn->A, &lda));
985   PetscCall(MatGetLocalSize(A, &s2, &s3));
986   if (ll) {
987     PetscCall(VecGetLocalSize(ll, &s2a));
988     PetscCheck(s2a == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT, s2a, s2);
989     PetscCall(VecGetArrayRead(ll, &l));
990     for (i = 0; i < m; i++) {
991       x = l[i];
992       v = vv + i;
993       for (j = 0; j < n; j++) {
994         (*v) *= x;
995         v += lda;
996       }
997     }
998     PetscCall(VecRestoreArrayRead(ll, &l));
999     PetscCall(PetscLogFlops(1.0 * n * m));
1000   }
1001   if (rr) {
1002     const PetscScalar *ar;
1003 
1004     PetscCall(VecGetLocalSize(rr, &s3a));
1005     PetscCheck(s3a == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vec non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT ".", s3a, s3);
1006     PetscCall(VecGetArrayRead(rr, &ar));
1007     if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
1008     PetscCall(VecGetArray(mdn->lvec, &r));
1009     PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
1010     PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
1011     PetscCall(VecRestoreArrayRead(rr, &ar));
1012     for (i = 0; i < n; i++) {
1013       x = r[i];
1014       v = vv + i * lda;
1015       for (j = 0; j < m; j++) (*v++) *= x;
1016     }
1017     PetscCall(VecRestoreArray(mdn->lvec, &r));
1018     PetscCall(PetscLogFlops(1.0 * n * m));
1019   }
1020   PetscCall(MatDenseRestoreArray(mdn->A, &vv));
1021   PetscFunctionReturn(PETSC_SUCCESS);
1022 }
1023 
1024 static PetscErrorCode MatNorm_MPIDense(Mat A, NormType type, PetscReal *nrm)
1025 {
1026   Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
1027   PetscInt           i, j;
1028   PetscMPIInt        size;
1029   PetscReal          sum = 0.0;
1030   const PetscScalar *av, *v;
1031 
1032   PetscFunctionBegin;
1033   PetscCall(MatDenseGetArrayRead(mdn->A, &av));
1034   v = av;
1035   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1036   if (size == 1) {
1037     PetscCall(MatNorm(mdn->A, type, nrm));
1038   } else {
1039     if (type == NORM_FROBENIUS) {
1040       for (i = 0; i < mdn->A->cmap->n * mdn->A->rmap->n; i++) {
1041         sum += PetscRealPart(PetscConj(*v) * (*v));
1042         v++;
1043       }
1044       PetscCallMPI(MPIU_Allreduce(&sum, nrm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1045       *nrm = PetscSqrtReal(*nrm);
1046       PetscCall(PetscLogFlops(2.0 * mdn->A->cmap->n * mdn->A->rmap->n));
1047     } else if (type == NORM_1) {
1048       PetscReal *tmp;
1049 
1050       PetscCall(PetscCalloc1(A->cmap->N, &tmp));
1051       *nrm = 0.0;
1052       v    = av;
1053       for (j = 0; j < mdn->A->cmap->n; j++) {
1054         for (i = 0; i < mdn->A->rmap->n; i++) {
1055           tmp[j] += PetscAbsScalar(*v);
1056           v++;
1057         }
1058       }
1059       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, A->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1060       for (j = 0; j < A->cmap->N; j++) {
1061         if (tmp[j] > *nrm) *nrm = tmp[j];
1062       }
1063       PetscCall(PetscFree(tmp));
1064       PetscCall(PetscLogFlops(A->cmap->n * A->rmap->n));
1065     } else if (type == NORM_INFINITY) { /* max row norm */
1066       PetscCall(MatNorm(mdn->A, type, nrm));
1067       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, nrm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
1068     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "No support for two norm");
1069   }
1070   PetscCall(MatDenseRestoreArrayRead(mdn->A, &av));
1071   PetscFunctionReturn(PETSC_SUCCESS);
1072 }
1073 
/* Transpose a MATMPIDENSE matrix.

   Builds the transpose by scattering each local column of A as a row of B via
   MatSetValues, then assembling.  For MAT_INPLACE_MATRIX the temporary B is
   merged back into A's header (MatHeaderMerge), so the ordering of the final
   branches is significant. */
static PetscErrorCode MatTranspose_MPIDense(Mat A, MatReuse reuse, Mat *matout)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat           B;
  PetscInt      M = A->rmap->N, N = A->cmap->N, m, n, *rwork, rstart = A->rmap->rstart;
  PetscInt      j, i, lda;
  PetscScalar  *v;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* create B with swapped dimensions; MAT_INPLACE also goes through a temporary */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
    PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
    PetscCall(MatMPIDenseSetPreallocation(B, NULL));
  } else B = *matout;

  m = a->A->rmap->n;
  n = a->A->cmap->n;
  PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)&v));
  PetscCall(MatDenseGetLDA(a->A, &lda));
  /* rwork holds the global row indices of this process's local rows */
  PetscCall(PetscMalloc1(m, &rwork));
  for (i = 0; i < m; i++) rwork[i] = rstart + i;
  /* column j of the local block becomes (part of) global row j of B */
  for (j = 0; j < n; j++) {
    PetscCall(MatSetValues(B, 1, &j, m, rwork, v, INSERT_VALUES));
    v = PetscSafePointerPlusOffset(v, lda);
  }
  PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)&v));
  PetscCall(PetscFree(rwork));
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
    *matout = B;
  } else {
    /* MAT_INPLACE_MATRIX: replace A's contents with B, destroying B's header */
    PetscCall(MatHeaderMerge(A, &B));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1112 
1113 static PetscErrorCode       MatDuplicate_MPIDense(Mat, MatDuplicateOption, Mat *);
1114 PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat, PetscScalar);
1115 
/* Finalize the layouts of a MATMPIDENSE matrix and preallocate its storage
   (with PETSc-managed memory) if the user has not done so already. */
static PetscErrorCode MatSetUp_MPIDense(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  if (!A->preallocated) PetscCall(MatMPIDenseSetPreallocation(A, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1124 
1125 static PetscErrorCode MatAXPY_MPIDense(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1126 {
1127   Mat_MPIDense *A = (Mat_MPIDense *)Y->data, *B = (Mat_MPIDense *)X->data;
1128 
1129   PetscFunctionBegin;
1130   PetscCall(MatAXPY(A->A, alpha, B->A, str));
1131   PetscFunctionReturn(PETSC_SUCCESS);
1132 }
1133 
1134 static PetscErrorCode MatConjugate_MPIDense(Mat mat)
1135 {
1136   Mat_MPIDense *a = (Mat_MPIDense *)mat->data;
1137 
1138   PetscFunctionBegin;
1139   PetscCall(MatConjugate(a->A));
1140   PetscFunctionReturn(PETSC_SUCCESS);
1141 }
1142 
1143 static PetscErrorCode MatRealPart_MPIDense(Mat A)
1144 {
1145   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1146 
1147   PetscFunctionBegin;
1148   PetscCall(MatRealPart(a->A));
1149   PetscFunctionReturn(PETSC_SUCCESS);
1150 }
1151 
1152 static PetscErrorCode MatImaginaryPart_MPIDense(Mat A)
1153 {
1154   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1155 
1156   PetscFunctionBegin;
1157   PetscCall(MatImaginaryPart(a->A));
1158   PetscFunctionReturn(PETSC_SUCCESS);
1159 }
1160 
1161 static PetscErrorCode MatGetColumnVector_MPIDense(Mat A, Vec v, PetscInt col)
1162 {
1163   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1164 
1165   PetscFunctionBegin;
1166   PetscCheck(a->A, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing local matrix");
1167   PetscCheck(a->A->ops->getcolumnvector, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing get column operation");
1168   PetscCall((*a->A->ops->getcolumnvector)(a->A, v, col));
1169   PetscFunctionReturn(PETSC_SUCCESS);
1170 }
1171 
1172 PETSC_INTERN PetscErrorCode MatGetColumnReductions_SeqDense(Mat, PetscInt, PetscReal *);
1173 
1174 static PetscErrorCode MatGetColumnReductions_MPIDense(Mat A, PetscInt type, PetscReal *reductions)
1175 {
1176   PetscInt      i, m, n;
1177   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1178 
1179   PetscFunctionBegin;
1180   PetscCall(MatGetSize(A, &m, &n));
1181   if (type == REDUCTION_MEAN_REALPART) {
1182     PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_REALPART, reductions));
1183   } else if (type == REDUCTION_MEAN_IMAGINARYPART) {
1184     PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_IMAGINARYPART, reductions));
1185   } else {
1186     PetscCall(MatGetColumnReductions_SeqDense(a->A, type, reductions));
1187   }
1188   if (type == NORM_2) {
1189     for (i = 0; i < n; i++) reductions[i] *= reductions[i];
1190   }
1191   if (type == NORM_INFINITY) {
1192     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_MAX, A->hdr.comm));
1193   } else {
1194     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_SUM, A->hdr.comm));
1195   }
1196   if (type == NORM_2) {
1197     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
1198   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
1199     for (i = 0; i < n; i++) reductions[i] /= m;
1200   }
1201   PetscFunctionReturn(PETSC_SUCCESS);
1202 }
1203 
1204 static PetscErrorCode MatSetRandom_MPIDense(Mat x, PetscRandom rctx)
1205 {
1206   Mat_MPIDense *d = (Mat_MPIDense *)x->data;
1207 
1208   PetscFunctionBegin;
1209   PetscCall(MatSetRandom(d->A, rctx));
1210 #if defined(PETSC_HAVE_DEVICE)
1211   x->offloadmask = d->A->offloadmask;
1212 #endif
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
/* A dense matrix explicitly stores every entry, so no diagonal entry can be
   missing; `d` (the index of a missing entry) is intentionally left untouched. */
static PetscErrorCode MatMissingDiagonal_MPIDense(Mat A, PetscBool *missing, PetscInt *d)
{
  PetscFunctionBegin;
  *missing = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1222 
1223 static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
1224 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
1225 static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
1226 static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
1227 static PetscErrorCode MatEqual_MPIDense(Mat, Mat, PetscBool *);
1228 static PetscErrorCode MatLoad_MPIDense(Mat, PetscViewer);
1229 static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat);
1230 
/* Operation table for MATMPIDENSE.  Slots correspond by position to the
   fields of struct _MatOps (the numeric comments mark every fifth slot);
   NULL means the operation is unsupported or provided by generic fallbacks. */
static struct _MatOps MatOps_Values = {MatSetValues_MPIDense,
                                       MatGetRow_MPIDense,
                                       MatRestoreRow_MPIDense,
                                       MatMult_MPIDense,
                                       /*  4*/ MatMultAdd_MPIDense,
                                       MatMultTranspose_MPIDense,
                                       MatMultTransposeAdd_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatTranspose_MPIDense,
                                       /* 15*/ MatGetInfo_MPIDense,
                                       MatEqual_MPIDense,
                                       MatGetDiagonal_MPIDense,
                                       MatDiagonalScale_MPIDense,
                                       MatNorm_MPIDense,
                                       /* 20*/ MatAssemblyBegin_MPIDense,
                                       MatAssemblyEnd_MPIDense,
                                       MatSetOption_MPIDense,
                                       MatZeroEntries_MPIDense,
                                       /* 24*/ MatZeroRows_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_MPIDense,
                                       NULL,
                                       NULL,
                                       MatGetDiagonalBlock_MPIDense,
                                       NULL,
                                       /* 34*/ MatDuplicate_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ MatAXPY_MPIDense,
                                       MatCreateSubMatrices_MPIDense,
                                       NULL,
                                       MatGetValues_MPIDense,
                                       MatCopy_MPIDense,
                                       /* 44*/ NULL,
                                       MatScale_MPIDense,
                                       MatShift_MPIDense,
                                       NULL,
                                       NULL,
                                       /* 49*/ MatSetRandom_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ MatCreateSubMatrix_MPIDense,
                                       MatDestroy_MPIDense,
                                       MatView_MPIDense,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 83*/ MatLoad_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 94*/ NULL,
                                       NULL,
                                       MatMatTransposeMultSymbolic_MPIDense_MPIDense,
                                       MatMatTransposeMultNumeric_MPIDense_MPIDense,
                                       NULL,
                                       /* 99*/ MatProductSetFromOptions_MPIDense,
                                       NULL,
                                       NULL,
                                       MatConjugate_MPIDense,
                                       NULL,
                                       /*104*/ NULL,
                                       MatRealPart_MPIDense,
                                       MatImaginaryPart_MPIDense,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetColumnVector_MPIDense,
                                       MatMissingDiagonal_MPIDense,
                                       /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*119*/ NULL,
                                       NULL,
                                       MatMultHermitianTranspose_MPIDense,
                                       MatMultHermitianTransposeAdd_MPIDense,
                                       NULL,
                                       /*124*/ NULL,
                                       MatGetColumnReductions_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       MatTransposeMatMultSymbolic_MPIDense_MPIDense,
                                       MatTransposeMatMultNumeric_MPIDense_MPIDense,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatCreateMPIMatConcatenateSeqMat_MPIDense,
                                       /*145*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*150*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*155*/ NULL,
                                       NULL};
1388 
/* Preallocate the storage of an MPIDENSE matrix: sets up the row/column layouts,
   lazily creates the inner sequential dense matrix a->A (each rank holds its row
   slice against ALL global columns), and hands `data` to it. `data` may be NULL,
   in which case PETSc allocates the storage. */
static PetscErrorCode MatMPIDenseSetPreallocation_MPIDense(Mat mat, PetscScalar *data)
{
  Mat_MPIDense *a     = (Mat_MPIDense *)mat->data;
  MatType       mtype = MATSEQDENSE; /* inner matrix type; may be switched to a device type below */

  PetscFunctionBegin;
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(PetscLayoutSetUp(mat->rmap));
  PetscCall(PetscLayoutSetUp(mat->cmap));
  if (!a->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
    /* local rows x global columns: each rank stores its rows in full */
    PetscCall(MatSetSizes(a->A, mat->rmap->n, mat->cmap->N, mat->rmap->n, mat->cmap->N));
  }
#if defined(PETSC_HAVE_CUDA)
  PetscBool iscuda;
  PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSECUDA, &iscuda));
  if (iscuda) mtype = MATSEQDENSECUDA;
#endif
#if defined(PETSC_HAVE_HIP)
  PetscBool iship;
  PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSEHIP, &iship));
  if (iship) mtype = MATSEQDENSEHIP;
#endif
  PetscCall(MatSetType(a->A, mtype));
  PetscCall(MatSeqDenseSetPreallocation(a->A, data));
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  /* mirror the inner matrix's host/device residency on the outer matrix */
  mat->offloadmask = a->A->offloadmask;
#endif
  mat->preallocated = PETSC_TRUE;
  mat->assembled    = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1421 
1422 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIDense(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
1423 {
1424   Mat B, C;
1425 
1426   PetscFunctionBegin;
1427   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &C));
1428   PetscCall(MatConvert_SeqAIJ_SeqDense(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &B));
1429   PetscCall(MatDestroy(&C));
1430   if (reuse == MAT_REUSE_MATRIX) {
1431     C = *newmat;
1432   } else C = NULL;
1433   PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
1434   PetscCall(MatDestroy(&B));
1435   if (reuse == MAT_INPLACE_MATRIX) {
1436     PetscCall(MatHeaderReplace(A, &C));
1437   } else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
1438   PetscFunctionReturn(PETSC_SUCCESS);
1439 }
1440 
1441 static PetscErrorCode MatConvert_MPIDense_MPIAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
1442 {
1443   Mat B, C;
1444 
1445   PetscFunctionBegin;
1446   PetscCall(MatDenseGetLocalMatrix(A, &C));
1447   PetscCall(MatConvert_SeqDense_SeqAIJ(C, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
1448   if (reuse == MAT_REUSE_MATRIX) {
1449     C = *newmat;
1450   } else C = NULL;
1451   PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
1452   PetscCall(MatDestroy(&B));
1453   if (reuse == MAT_INPLACE_MATRIX) {
1454     PetscCall(MatHeaderReplace(A, &C));
1455   } else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
1456   PetscFunctionReturn(PETSC_SUCCESS);
1457 }
1458 
#if defined(PETSC_HAVE_ELEMENTAL)
/* Convert a distributed dense matrix to Elemental format by inserting every
   locally owned entry into a new (or reused) MATELEMENTAL matrix. */
PETSC_INTERN PetscErrorCode MatConvert_MPIDense_Elemental(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat           mat_elemental;
  PetscScalar  *v;
  PetscInt      m = A->rmap->n, N = A->cmap->N, rstart = A->rmap->rstart, i, *rows, *cols, lda;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) {
    mat_elemental = *newmat;
    PetscCall(MatZeroEntries(*newmat)); /* entries are accumulated with ADD_VALUES below, so start from zero */
  } else {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat_elemental));
    PetscCall(MatSetSizes(mat_elemental, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
    PetscCall(MatSetType(mat_elemental, MATELEMENTAL));
    PetscCall(MatSetUp(mat_elemental));
    PetscCall(MatSetOption(mat_elemental, MAT_ROW_ORIENTED, PETSC_FALSE)); /* values below are passed column-major */
  }

  /* global indices of the locally owned rows and of all columns */
  PetscCall(PetscMalloc2(m, &rows, N, &cols));
  for (i = 0; i < N; i++) cols[i] = i;
  for (i = 0; i < m; i++) rows[i] = rstart + i;

  /* PETSc-Elemental interface uses axpy for setting off-processor entries, only ADD_VALUES is allowed */
  PetscCall(MatDenseGetArray(A, &v));
  PetscCall(MatDenseGetLDA(a->A, &lda));
  if (lda == m) PetscCall(MatSetValues(mat_elemental, m, rows, N, cols, v, ADD_VALUES)); /* contiguous storage: one call suffices */
  else {
    /* leading dimension exceeds the row count: insert one column at a time */
    for (i = 0; i < N; i++) PetscCall(MatSetValues(mat_elemental, m, rows, 1, &i, v + lda * i, ADD_VALUES));
  }
  PetscCall(MatAssemblyBegin(mat_elemental, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mat_elemental, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDenseRestoreArray(A, &v));
  PetscCall(PetscFree2(rows, cols));

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &mat_elemental)); /* A itself becomes the Elemental matrix */
  } else {
    *newmat = mat_elemental;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
1503 
1504 static PetscErrorCode MatDenseGetColumn_MPIDense(Mat A, PetscInt col, PetscScalar **vals)
1505 {
1506   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
1507 
1508   PetscFunctionBegin;
1509   PetscCall(MatDenseGetColumn(mat->A, col, vals));
1510   PetscFunctionReturn(PETSC_SUCCESS);
1511 }
1512 
1513 static PetscErrorCode MatDenseRestoreColumn_MPIDense(Mat A, PetscScalar **vals)
1514 {
1515   Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
1516 
1517   PetscFunctionBegin;
1518   PetscCall(MatDenseRestoreColumn(mat->A, vals));
1519   PetscFunctionReturn(PETSC_SUCCESS);
1520 }
1521 
1522 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIDense(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
1523 {
1524   Mat_MPIDense *mat;
1525   PetscInt      m, nloc, N;
1526 
1527   PetscFunctionBegin;
1528   PetscCall(MatGetSize(inmat, &m, &N));
1529   PetscCall(MatGetLocalSize(inmat, NULL, &nloc));
1530   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
1531     PetscInt sum;
1532 
1533     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
1534     /* Check sum(n) = N */
1535     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
1536     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
1537 
1538     PetscCall(MatCreateDense(comm, m, n, PETSC_DETERMINE, N, NULL, outmat));
1539     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
1540   }
1541 
1542   /* numeric phase */
1543   mat = (Mat_MPIDense *)(*outmat)->data;
1544   PetscCall(MatCopy(inmat, mat->A, SAME_NONZERO_PATTERN));
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 PetscErrorCode MatDenseGetColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
1549 {
1550   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1551   PetscInt      lda;
1552 
1553   PetscFunctionBegin;
1554   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1555   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1556   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1557   a->vecinuse = col + 1;
1558   PetscCall(MatDenseGetLDA(a->A, &lda));
1559   PetscCall(MatDenseGetArray(a->A, (PetscScalar **)&a->ptrinuse));
1560   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1561   *v = a->cvec;
1562   PetscFunctionReturn(PETSC_SUCCESS);
1563 }
1564 
1565 PetscErrorCode MatDenseRestoreColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
1566 {
1567   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1568 
1569   PetscFunctionBegin;
1570   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1571   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1572   VecCheckAssembled(a->cvec);
1573   a->vecinuse = 0;
1574   PetscCall(MatDenseRestoreArray(a->A, (PetscScalar **)&a->ptrinuse));
1575   PetscCall(VecResetArray(a->cvec));
1576   if (v) *v = NULL;
1577   PetscFunctionReturn(PETSC_SUCCESS);
1578 }
1579 
1580 PetscErrorCode MatDenseGetColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
1581 {
1582   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1583   PetscInt      lda;
1584 
1585   PetscFunctionBegin;
1586   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1587   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1588   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1589   a->vecinuse = col + 1;
1590   PetscCall(MatDenseGetLDA(a->A, &lda));
1591   PetscCall(MatDenseGetArrayRead(a->A, &a->ptrinuse));
1592   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1593   PetscCall(VecLockReadPush(a->cvec));
1594   *v = a->cvec;
1595   PetscFunctionReturn(PETSC_SUCCESS);
1596 }
1597 
/* Return a column vector obtained with MatDenseGetColumnVecRead_MPIDense():
   restores the read-only array of the local block and pops the vector's read
   lock before resetting its array. */
PetscErrorCode MatDenseRestoreColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
  VecCheckAssembled(a->cvec);
  a->vecinuse = 0;
  PetscCall(MatDenseRestoreArrayRead(a->A, &a->ptrinuse));
  PetscCall(VecLockReadPop(a->cvec));
  PetscCall(VecResetArray(a->cvec));
  if (v) *v = NULL; /* the caller's reference is no longer valid */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1613 
1614 PetscErrorCode MatDenseGetColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
1615 {
1616   Mat_MPIDense *a = (Mat_MPIDense *)A->data;
1617   PetscInt      lda;
1618 
1619   PetscFunctionBegin;
1620   PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1621   PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1622   if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
1623   a->vecinuse = col + 1;
1624   PetscCall(MatDenseGetLDA(a->A, &lda));
1625   PetscCall(MatDenseGetArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
1626   PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
1627   *v = a->cvec;
1628   PetscFunctionReturn(PETSC_SUCCESS);
1629 }
1630 
/* Return a column vector obtained with MatDenseGetColumnVecWrite_MPIDense():
   restores the write-only array of the local block and detaches the cached
   vector from the matrix storage. */
PetscErrorCode MatDenseRestoreColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
  VecCheckAssembled(a->cvec);
  a->vecinuse = 0;
  PetscCall(MatDenseRestoreArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
  PetscCall(VecResetArray(a->cvec));
  if (v) *v = NULL; /* the caller's reference is no longer valid */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1645 
/* Return a virtual submatrix A[rbegin:rend, cbegin:cend) that shares A's storage.
   The submatrix is cached in a->cmat and reused by subsequent calls; only one
   submatrix (or column vector) may be checked out at a time, tracked with
   a->matinuse and released by MatDenseRestoreSubMatrix_MPIDense(). */
static PetscErrorCode MatDenseGetSubMatrix_MPIDense(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat_MPIDense *c;
  MPI_Comm      comm;
  PetscInt      prbegin, prend, pcbegin, pcend; /* requested ranges clipped to this rank's ownership */

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(!a->vecinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  /* intersect the requested global ranges with the local ownership ranges */
  prbegin = PetscMax(0, PetscMin(A->rmap->rend, rbegin) - A->rmap->rstart);
  prend   = PetscMin(A->rmap->n, PetscMax(0, rend - A->rmap->rstart));
  pcbegin = PetscMax(0, PetscMin(A->cmap->rend, cbegin) - A->cmap->rstart);
  pcend   = PetscMin(A->cmap->n, PetscMax(0, cend - A->cmap->rstart));
  if (!a->cmat) {
    /* first request: create the cached submatrix with matching layouts */
    PetscCall(MatCreate(comm, &a->cmat));
    PetscCall(MatSetType(a->cmat, ((PetscObject)A)->type_name));
    /* share A's layout when the full global range is requested, otherwise build one */
    if (rend - rbegin == A->rmap->N) PetscCall(PetscLayoutReference(A->rmap, &a->cmat->rmap));
    else {
      PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->rmap));
    }
    if (cend - cbegin == A->cmap->N) PetscCall(PetscLayoutReference(A->cmap, &a->cmat->cmap));
    else {
      PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->cmap));
    }
    /* remember which global ranges the cached submatrix was built for */
    c             = (Mat_MPIDense *)a->cmat->data;
    c->sub_rbegin = rbegin;
    c->sub_rend   = rend;
    c->sub_cbegin = cbegin;
    c->sub_cend   = cend;
  }
  c = (Mat_MPIDense *)a->cmat->data;
  /* requested row range changed since the last call: rebuild the row layout */
  if (c->sub_rbegin != rbegin || c->sub_rend != rend) {
    PetscCall(PetscLayoutDestroy(&a->cmat->rmap));
    PetscCall(PetscLayoutCreate(comm, &a->cmat->rmap));
    PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
    PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
    PetscCall(PetscLayoutSetUp(a->cmat->rmap));
    c->sub_rbegin = rbegin;
    c->sub_rend   = rend;
  }
  /* likewise for the column range; the communication objects (lvec, Mvctx)
     depend on the column layout and are discarded so they get rebuilt */
  if (c->sub_cbegin != cbegin || c->sub_cend != cend) {
    // special optimization: check if all columns are owned by rank 0, in which case no communication is necessary
    if ((cend - cbegin != a->cmat->cmap->N) || (A->cmap->range[1] != A->cmap->N)) {
      PetscCall(PetscLayoutDestroy(&a->cmat->cmap));
      PetscCall(PetscLayoutCreate(comm, &a->cmat->cmap));
      PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->cmap));
      PetscCall(VecDestroy(&c->lvec));
      PetscCall(PetscSFDestroy(&c->Mvctx));
    }
    c->sub_cbegin = cbegin;
    c->sub_cend   = cend;
  }
  /* the inner submatrix must not still be checked out from a previous call */
  PetscCheck(!c->A, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  /* the inner sequential block stores all global columns locally, so the
     column bounds passed here are the global ones (cbegin/cend, not pcbegin/pcend) */
  PetscCall(MatDenseGetSubMatrix(a->A, prbegin, prend, cbegin, cend, &c->A));

  a->cmat->preallocated = PETSC_TRUE;
  a->cmat->assembled    = PETSC_TRUE;
#if defined(PETSC_HAVE_DEVICE)
  a->cmat->offloadmask = c->A->offloadmask;
#endif
  a->matinuse = cbegin + 1; /* any nonzero value marks the submatrix as checked out */
  *v          = a->cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1718 
/* Return the submatrix obtained with MatDenseGetSubMatrix_MPIDense(): releases
   the inner sequential submatrix and clears the in-use flag. The cached a->cmat
   itself is kept for reuse by the next Get call. */
static PetscErrorCode MatDenseRestoreSubMatrix_MPIDense(Mat A, Mat *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat_MPIDense *c;

  PetscFunctionBegin;
  PetscCheck(a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(a->cmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal matrix");
  PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  a->matinuse = 0;
  c           = (Mat_MPIDense *)a->cmat->data;
  PetscCall(MatDenseRestoreSubMatrix(a->A, &c->A));
  if (v) *v = NULL; /* the caller's reference is no longer valid */
#if defined(PETSC_HAVE_DEVICE)
  /* mirror the inner matrix's residency, which the restore may have changed */
  A->offloadmask = a->A->offloadmask;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1737 
1738 /*MC
1739    MATMPIDENSE - MATMPIDENSE = "mpidense" - A matrix type to be used for distributed dense matrices.
1740 
1741    Options Database Key:
1742 . -mat_type mpidense - sets the matrix type to `MATMPIDENSE` during a call to `MatSetFromOptions()`
1743 
1744   Level: beginner
1745 
1746 .seealso: [](ch_matrices), `Mat`, `MatCreateDense()`, `MATSEQDENSE`, `MATDENSE`
1747 M*/
/* Constructor registered for MATMPIDENSE: installs the function table, creates
   the stash used to collect off-process entries set with MatSetValues(), and
   composes the type-specific "..._C" methods on the object. The inner
   sequential matrix a->A is created later, during preallocation. */
PetscErrorCode MatCreate_MPIDense(Mat mat)
{
  Mat_MPIDense *a;

  PetscFunctionBegin;
  PetscCall(PetscNew(&a));
  mat->data   = (void *)a;
  mat->ops[0] = MatOps_Values;

  mat->insertmode = NOT_SET_VALUES;

  /* build cache for off array entries formed */
  a->donotstash = PETSC_FALSE;

  PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)mat), 1, &mat->stash));

  /* stuff used for matrix vector multiply */
  a->lvec        = NULL;
  a->Mvctx       = NULL;
  a->roworiented = PETSC_TRUE;

  /* dense array accessors and column/submatrix checkout */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", MatDenseGetLDA_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", MatDenseSetLDA_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", MatDenseGetArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", MatDenseRestoreArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", MatDenseRestoreArrayRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", MatDenseRestoreArrayWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", MatDensePlaceArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", MatDenseResetArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", MatDenseReplaceArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_MPIDense));
  /* conversions to/from other formats */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", MatConvert_MPIDense_MPIAIJ));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", MatConvert_MPIDense_Elemental));
#endif
#if defined(PETSC_HAVE_SCALAPACK)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", MatConvert_Dense_ScaLAPACK));
#endif
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", MatConvert_MPIDense_MPIDenseCUDA));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", MatMPIDenseSetPreallocation_MPIDense));
  /* matrix-product dispatch */
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", MatConvert_MPIDense_MPIDenseHIP));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", MatDenseGetColumn_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", MatDenseRestoreColumn_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", MatMultColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", MatMultAddColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_MPIDense));
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATMPIDENSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1820 
1821 /*MC
1822    MATDENSE - MATDENSE = "dense" - A matrix type to be used for dense matrices.
1823 
1824    This matrix type is identical to `MATSEQDENSE` when constructed with a single process communicator,
1825    and `MATMPIDENSE` otherwise.
1826 
1827    Options Database Key:
1828 . -mat_type dense - sets the matrix type to `MATDENSE` during a call to `MatSetFromOptions()`
1829 
1830   Level: beginner
1831 
1832 .seealso: [](ch_matrices), `Mat`, `MATSEQDENSE`, `MATMPIDENSE`, `MATDENSECUDA`, `MATDENSEHIP`
1833 M*/
1834 
/*@
  MatMPIDenseSetPreallocation - Sets the array used to store the matrix entries

  Collective

  Input Parameters:
+ B    - the matrix
- data - optional location of matrix data.  Set to `NULL` for PETSc
         to control all matrix memory allocation.

  Level: intermediate

  Notes:
  The dense format is fully compatible with standard Fortran
  storage by columns.

  The data input variable is intended primarily for Fortran programmers
  who wish to allocate their own matrix memory space.  Most users should
  set `data` to `NULL`.

.seealso: [](ch_matrices), `Mat`, `MATMPIDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
@*/
PetscErrorCode MatMPIDenseSetPreallocation(Mat B, PetscScalar *data)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  /* dispatch to the implementation composed on the object; a no-op for matrix
     types that do not provide "MatMPIDenseSetPreallocation_C" */
  PetscTryMethod(B, "MatMPIDenseSetPreallocation_C", (Mat, PetscScalar *), (B, data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1864 
/*@
  MatDensePlaceArray - Allows one to replace the array in a `MATDENSE` matrix with an
  array provided by the user. This is useful to avoid copying an array
  into a matrix

  Not Collective

  Input Parameters:
+ mat   - the matrix
- array - the array in column major order

  Level: developer

  Note:
  Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.

  You can return to the original array with a call to `MatDenseResetArray()`. The user is responsible for freeing this array; it will not be
  freed when the matrix is destroyed.

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDenseResetArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`,
          `MatDenseReplaceArray()`
@*/
PetscErrorCode MatDensePlaceArray(Mat mat, const PetscScalar *array)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDensePlaceArray_C", (Mat, const PetscScalar *), (mat, array));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat)); /* matrix values (may) have changed */
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  /* NOTE(review): assumes the placed array resides in host memory -- confirm for device builds */
  mat->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1898 
/*@
  MatDenseResetArray - Resets the matrix array to that it previously had before the call to `MatDensePlaceArray()`

  Not Collective

  Input Parameter:
. mat - the matrix

  Level: developer

  Note:
  You can only call this after a call to `MatDensePlaceArray()`

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDensePlaceArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`
@*/
PetscErrorCode MatDenseResetArray(Mat mat)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDenseResetArray_C", (Mat), (mat));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat)); /* values revert to the original array */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1922 
/*@
  MatDenseReplaceArray - Allows one to replace the array in a dense matrix with an
  array provided by the user. This is useful to avoid copying an array
  into a matrix

  Not Collective

  Input Parameters:
+ mat   - the matrix
- array - the array in column major order

  Level: developer

  Note:
  Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.

  The memory passed in MUST be obtained with `PetscMalloc()` and CANNOT be
  freed by the user. It will be freed when the matrix is destroyed.

.seealso: [](ch_matrices), `Mat`, `MatDensePlaceArray()`, `MatDenseGetArray()`, `VecReplaceArray()`
@*/
PetscErrorCode MatDenseReplaceArray(Mat mat, const PetscScalar *array)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDenseReplaceArray_C", (Mat, const PetscScalar *), (mat, array));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat)); /* matrix values (may) have changed */
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  /* NOTE(review): assumes the replacement array resides in host memory -- confirm for device builds */
  mat->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1955 
/*@
  MatCreateDense - Creates a matrix in `MATDENSE` format.

  Collective

  Input Parameters:
+ comm - MPI communicator
. m    - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
. n    - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
. M    - number of global rows (or `PETSC_DECIDE` to have calculated if `m` is given)
. N    - number of global columns (or `PETSC_DECIDE` to have calculated if `n` is given)
- data - optional location of matrix data.  Set data to `NULL` (`PETSC_NULL_SCALAR` for Fortran users) for PETSc
   to control all matrix memory allocation.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  The dense format is fully compatible with standard Fortran
  storage by columns.

  Although local portions of the matrix are stored in column-major
  order, the matrix is partitioned across MPI ranks by row.

  The data input variable is intended primarily for Fortran programmers
  who wish to allocate their own matrix memory space.  Most users should
  set `data` to `NULL` (`PETSC_NULL_SCALAR` for Fortran users).

  The user MUST specify either the local or global matrix dimensions
  (possibly both).

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateDense(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar data[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCall(MatSetType(*A, MATDENSE));
  /* MATDENSE resolves to MATSEQDENSE or MATMPIDENSE depending on the communicator
     size; both preallocation routines are called since only the one matching the
     resulting type takes effect (the other is presumably a no-op "try" method --
     see MatMPIDenseSetPreallocation(); confirm for the seq variant) */
  PetscCall(MatSeqDenseSetPreallocation(*A, data));
  PetscCall(MatMPIDenseSetPreallocation(*A, data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2001 
2002 static PetscErrorCode MatDuplicate_MPIDense(Mat A, MatDuplicateOption cpvalues, Mat *newmat)
2003 {
2004   Mat           mat;
2005   Mat_MPIDense *a, *oldmat = (Mat_MPIDense *)A->data;
2006 
2007   PetscFunctionBegin;
2008   *newmat = NULL;
2009   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat));
2010   PetscCall(MatSetSizes(mat, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
2011   PetscCall(MatSetType(mat, ((PetscObject)A)->type_name));
2012   a = (Mat_MPIDense *)mat->data;
2013 
2014   mat->factortype   = A->factortype;
2015   mat->assembled    = PETSC_TRUE;
2016   mat->preallocated = PETSC_TRUE;
2017 
2018   mat->insertmode = NOT_SET_VALUES;
2019   a->donotstash   = oldmat->donotstash;
2020 
2021   PetscCall(PetscLayoutReference(A->rmap, &mat->rmap));
2022   PetscCall(PetscLayoutReference(A->cmap, &mat->cmap));
2023 
2024   PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
2025 
2026   *newmat = mat;
2027   PetscFunctionReturn(PETSC_SUCCESS);
2028 }
2029 
/* Load the matrix from a viewer; binary and (when PETSc is built with HDF5)
   HDF5 viewers are supported, anything else raises PETSC_ERR_SUP. */
static PetscErrorCode MatLoad_MPIDense(Mat newMat, PetscViewer viewer)
{
  PetscBool isbinary;
#if defined(PETSC_HAVE_HDF5)
  PetscBool ishdf5;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  /* force binary viewer to load .info file if it has not yet done so */
  PetscCall(PetscViewerSetUp(viewer));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
#if defined(PETSC_HAVE_HDF5)
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
#endif
  if (isbinary) {
    PetscCall(MatLoad_Dense_Binary(newMat, viewer));
#if defined(PETSC_HAVE_HDF5)
  } else if (ishdf5) {
    PetscCall(MatLoad_Dense_HDF5(newMat, viewer));
#endif
  } else SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
  PetscFunctionReturn(PETSC_SUCCESS);
}
2055 
2056 static PetscErrorCode MatEqual_MPIDense(Mat A, Mat B, PetscBool *flag)
2057 {
2058   Mat_MPIDense *matB = (Mat_MPIDense *)B->data, *matA = (Mat_MPIDense *)A->data;
2059   Mat           a, b;
2060 
2061   PetscFunctionBegin;
2062   a = matA->A;
2063   b = matB->A;
2064   PetscCall(MatEqual(a, b, flag));
2065   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2066   PetscFunctionReturn(PETSC_SUCCESS);
2067 }
2068 
2069 static PetscErrorCode MatDestroy_MatTransMatMult_MPIDense_MPIDense(void *data)
2070 {
2071   Mat_TransMatMultDense *atb = (Mat_TransMatMultDense *)data;
2072 
2073   PetscFunctionBegin;
2074   PetscCall(PetscFree2(atb->sendbuf, atb->recvcounts));
2075   PetscCall(MatDestroy(&atb->atb));
2076   PetscCall(PetscFree(atb));
2077   PetscFunctionReturn(PETSC_SUCCESS);
2078 }
2079 
2080 static PetscErrorCode MatDestroy_MatMatTransMult_MPIDense_MPIDense(void *data)
2081 {
2082   Mat_MatTransMultDense *abt = (Mat_MatTransMultDense *)data;
2083 
2084   PetscFunctionBegin;
2085   PetscCall(PetscFree2(abt->buf[0], abt->buf[1]));
2086   PetscCall(PetscFree2(abt->recvcounts, abt->recvdispls));
2087   PetscCall(PetscFree(abt));
2088   PetscFunctionReturn(PETSC_SUCCESS);
2089 }
2090 
/* Numeric phase of C = A^T * B for parallel dense matrices.  Every rank
   forms the full-sized local product aseq^T * bseq; the per-rank partial
   results are then summed and distributed so each rank holds its owned rows
   of C.  Two paths: an SF reduce (GPU-aware-MPI capable) when rank 0 owns
   all rows of C, and MPI_Reduce_scatter in the general case. */
static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
{
  Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
  Mat_TransMatMultDense *atb;
  MPI_Comm               comm;
  PetscMPIInt            size, *recvcounts;
  PetscScalar           *carray, *sendbuf;
  const PetscScalar     *atbarray;
  PetscInt               i, cN = C->cmap->N, proc, k, j, lda;
  const PetscInt        *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  atb        = (Mat_TransMatMultDense *)C->product->data;
  recvcounts = atb->recvcounts;
  sendbuf    = atb->sendbuf;

  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* compute atbarray = aseq^T * bseq */
  PetscCall(MatTransposeMatMult(a->A, b->A, atb->atb ? MAT_REUSE_MATRIX : MAT_INITIAL_MATRIX, PETSC_DETERMINE, &atb->atb));

  PetscCall(MatGetOwnershipRanges(C, &ranges));

  if (ranges[1] == C->rmap->N) {
    /* all of the values are being reduced to rank 0: optimize this case to use MPI_Reduce and GPU aware MPI if available */
    PetscInt           atb_lda, c_lda;
    Mat                atb_local = atb->atb;
    Mat                atb_alloc = NULL;
    Mat                c_local   = c->A;
    Mat                c_alloc   = NULL;
    PetscMemType       atb_memtype, c_memtype;
    const PetscScalar *atb_array = NULL;
    MPI_Datatype       vector_type;
    PetscScalar       *c_array = NULL;
    PetscMPIInt        rank;

    PetscCallMPI(MPI_Comm_rank(comm, &rank));

    PetscCall(MatDenseGetLDA(atb_local, &atb_lda));
    if (atb_lda != C->rmap->N) {
      // copy atb to a matrix that will have lda == the number of rows
      PetscCall(MatDuplicate(atb_local, MAT_DO_NOT_COPY_VALUES, &atb_alloc));
      PetscCall(MatCopy(atb_local, atb_alloc, DIFFERENT_NONZERO_PATTERN));
      atb_local = atb_alloc;
    }

    if (rank == 0) {
      PetscCall(MatDenseGetLDA(c_local, &c_lda));
      if (c_lda != C->rmap->N) {
        // copy c to a matrix that will have lda == the number of rows
        PetscCall(MatDuplicate(c_local, MAT_DO_NOT_COPY_VALUES, &c_alloc));
        c_local = c_alloc;
      }
      /* the reduce accumulates into c_local, so start it from zero */
      PetscCall(MatZeroEntries(c_local));
    }
    /* atb_local and c_local have nrows = lda = A->cmap->N and ncols =
     * B->cmap->N: use the a->Mvctx to use the best reduction method */
    if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
    /* treat each matrix row (B->cmap->N scalars) as one MPI element */
    vector_type = MPIU_SCALAR;
    if (B->cmap->N > 1) {
      PetscCallMPI(MPI_Type_contiguous(B->cmap->N, MPIU_SCALAR, &vector_type));
      PetscCallMPI(MPI_Type_commit(&vector_type));
    }
    PetscCall(MatDenseGetArrayReadAndMemType(atb_local, &atb_array, &atb_memtype));
    PetscCall(MatDenseGetArrayWriteAndMemType(c_local, &c_array, &c_memtype));
    PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, vector_type, atb_memtype, atb_array, c_memtype, c_array, MPIU_SUM));
    PetscCall(PetscSFReduceEnd(a->Mvctx, vector_type, atb_array, c_array, MPIU_SUM));
    PetscCall(MatDenseRestoreArrayWriteAndMemType(c_local, &c_array));
    PetscCall(MatDenseRestoreArrayReadAndMemType(atb_local, &atb_array));
    /* if c_local was a temporary with packed lda, copy the result back */
    if (rank == 0 && c_local != c->A) PetscCall(MatCopy(c_local, c->A, DIFFERENT_NONZERO_PATTERN));
    if (B->cmap->N > 1) PetscCallMPI(MPI_Type_free(&vector_type));
    PetscCall(MatDestroy(&atb_alloc));
    PetscCall(MatDestroy(&c_alloc));
    PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
    PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* arrange atbarray into sendbuf */
  PetscCall(MatDenseGetArrayRead(atb->atb, &atbarray));
  PetscCall(MatDenseGetLDA(atb->atb, &lda));
  /* pack column-major per destination rank: for each rank, its row slice of
     every column, so the reduce-scatter delivers contiguous owned rows */
  for (proc = 0, k = 0; proc < size; proc++) {
    for (j = 0; j < cN; j++) {
      for (i = ranges[proc]; i < ranges[proc + 1]; i++) sendbuf[k++] = atbarray[i + j * lda];
    }
  }
  PetscCall(MatDenseRestoreArrayRead(atb->atb, &atbarray));

  /* sum all atbarray to local values of C */
  PetscCall(MatDenseGetArrayWrite(c->A, &carray));
  PetscCallMPI(MPI_Reduce_scatter(sendbuf, carray, recvcounts, MPIU_SCALAR, MPIU_SUM, comm));
  PetscCall(MatDenseRestoreArrayWrite(c->A, &carray));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2192 
/* Symbolic phase of C = A^T * B: validate that A and B share a row layout,
   size and type C, and allocate the reusable send buffer and reduce-scatter
   counts stored in the product data. */
static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
{
  MPI_Comm               comm;
  PetscMPIInt            size;
  PetscInt               cm = A->cmap->n, cM, cN = B->cmap->N;
  Mat_TransMatMultDense *atb;
  PetscBool              cisdense = PETSC_FALSE;
  const PetscInt        *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  /* A^T*B requires A and B to have identical row ownership on every rank */
  if (A->rmap->rstart != B->rmap->rstart || A->rmap->rend != B->rmap->rend) {
    SETERRQ(comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
  }

  /* create matrix product C */
  PetscCall(MatSetSizes(C, cm, B->cmap->n, A->cmap->N, B->cmap->N));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, ""));
#endif
#if defined(PETSC_HAVE_HIP)
  /* NOTE(review): when both CUDA and HIP are configured, this second compare
     overwrites the CUDA result above, so a MATMPIDENSECUDA C would be
     retyped below -- confirm whether that configuration is supported */
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSEHIP, ""));
#endif
  if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
  PetscCall(MatSetUp(C));

  /* create data structure for reuse C */
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscNew(&atb));
  cM = C->rmap->N;
  /* sendbuf holds one full copy of the local A^T*B product; recvcounts are
     the per-rank element counts for MPI_Reduce_scatter */
  PetscCall(PetscMalloc2(cM * cN, &atb->sendbuf, size, &atb->recvcounts));
  PetscCall(MatGetOwnershipRanges(C, &ranges));
  for (PetscMPIInt i = 0; i < size; i++) PetscCall(PetscMPIIntCast((ranges[i + 1] - ranges[i]) * cN, &atb->recvcounts[i]));
  C->product->data    = atb;
  C->product->destroy = MatDestroy_MatTransMatMult_MPIDense_MPIDense;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2232 
/* Symbolic phase of C = A * B^T: validate local column compatibility, size
   and type C, and allocate the communication buffers for the algorithm
   chosen via the product options ("allgatherv" or "cyclic"). */
static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
{
  MPI_Comm               comm;
  PetscMPIInt            i, size;
  PetscInt               maxRows, bufsiz;
  PetscMPIInt            tag;
  PetscInt               alg;
  Mat_MatTransMultDense *abt;
  Mat_Product           *product = C->product;
  PetscBool              flg;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  /* check local size of A and B */
  PetscCheck(A->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local column dimensions are incompatible, A (%" PetscInt_FMT ") != B (%" PetscInt_FMT ")", A->cmap->n, B->cmap->n);

  /* alg 0 = allgatherv, anything else = cyclic */
  PetscCall(PetscStrcmp(product->alg, "allgatherv", &flg));
  alg = flg ? 0 : 1;

  /* setup matrix product C */
  PetscCall(MatSetSizes(C, A->rmap->n, B->rmap->n, A->rmap->N, B->rmap->N));
  PetscCall(MatSetType(C, MATMPIDENSE));
  PetscCall(MatSetUp(C));
  /* private tag so the cyclic pipeline's messages cannot collide */
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag));

  /* create data structure for reuse C */
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscNew(&abt));
  abt->tag = tag;
  abt->alg = alg;
  switch (alg) {
  case 1: /* alg: "cyclic" */
    /* double-buffer sized for the largest row block of B on any rank */
    for (maxRows = 0, i = 0; i < size; i++) maxRows = PetscMax(maxRows, B->rmap->range[i + 1] - B->rmap->range[i]);
    bufsiz = A->cmap->N * maxRows;
    PetscCall(PetscMalloc2(bufsiz, &abt->buf[0], bufsiz, &abt->buf[1]));
    break;
  default: /* alg: "allgatherv" */
    /* buf[0]: local B^T pack; buf[1]: gathered global B^T */
    PetscCall(PetscMalloc2(B->rmap->n * B->cmap->N, &abt->buf[0], B->rmap->N * B->cmap->N, &abt->buf[1]));
    PetscCall(PetscMalloc2(size, &abt->recvcounts, size + 1, &abt->recvdispls));
    for (i = 0; i <= size; i++) PetscCall(PetscMPIIntCast(B->rmap->range[i] * A->cmap->N, &abt->recvdispls[i]));
    for (i = 0; i < size; i++) PetscCall(PetscMPIIntCast(abt->recvdispls[i + 1] - abt->recvdispls[i], &abt->recvcounts[i]));
    break;
  }

  C->product->data                = abt;
  C->product->destroy             = MatDestroy_MatMatTransMult_MPIDense_MPIDense;
  C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_MPIDense_MPIDense;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2284 
/* Numeric phase of C = A * B^T, cyclic algorithm: the packed local rows of B
   circulate around a ring of ranks; at each of the `size` steps every rank
   multiplies its local A block with the B block currently in hand while the
   next block is in flight (communication/computation overlap). */
static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(Mat A, Mat B, Mat C)
{
  Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
  Mat_MatTransMultDense *abt;
  MPI_Comm               comm;
  PetscMPIInt            rank, size, sendto, recvfrom, recvisfrom;
  PetscScalar           *sendbuf, *recvbuf = NULL, *cv;
  PetscInt               i, cK             = A->cmap->N, sendsiz, recvsiz, k, j, bn;
  PetscScalar            _DOne = 1.0, _DZero = 0.0;
  const PetscScalar     *av, *bv;
  PetscBLASInt           cm, cn, ck, alda, blda = 0, clda;
  MPI_Request            reqs[2];
  const PetscInt        *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  abt = (Mat_MatTransMultDense *)C->product->data;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(MatDenseGetArrayRead(a->A, &av));
  PetscCall(MatDenseGetArrayRead(b->A, &bv));
  PetscCall(MatDenseGetArrayWrite(c->A, &cv));
  PetscCall(MatDenseGetLDA(a->A, &i));
  PetscCall(PetscBLASIntCast(i, &alda));
  PetscCall(MatDenseGetLDA(b->A, &i));
  PetscCall(PetscBLASIntCast(i, &blda));
  PetscCall(MatDenseGetLDA(c->A, &i));
  PetscCall(PetscBLASIntCast(i, &clda));
  PetscCall(MatGetOwnershipRanges(B, &ranges));
  bn = B->rmap->n;
  if (blda == bn) {
    /* local B is already densely packed: send its storage directly */
    sendbuf = (PetscScalar *)bv;
  } else {
    /* pack local B column-by-column into buf[0], stripping the lda gap */
    sendbuf = abt->buf[0];
    for (k = 0, i = 0; i < cK; i++) {
      for (j = 0; j < bn; j++, k++) sendbuf[k] = bv[i * blda + j];
    }
  }
  /* ring neighbors: send to the previous rank, receive from the next */
  if (size > 1) {
    sendto   = (rank + size - 1) % size;
    recvfrom = (rank + size + 1) % size;
  } else {
    sendto = recvfrom = 0;
  }
  PetscCall(PetscBLASIntCast(cK, &ck));
  PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
  recvisfrom = rank;
  for (i = 0; i < size; i++) {
    /* we have finished receiving in sending, bufs can be read/modified */
    PetscMPIInt nextrecvisfrom = (recvisfrom + 1) % size; /* which process the next recvbuf will originate on */
    PetscInt    nextbn         = ranges[nextrecvisfrom + 1] - ranges[nextrecvisfrom];

    if (nextrecvisfrom != rank) {
      /* start the cyclic sends from sendbuf, to recvbuf (which will switch to sendbuf) */
      sendsiz = cK * bn;
      recvsiz = cK * nextbn;
      recvbuf = (i & 1) ? abt->buf[0] : abt->buf[1];
      PetscCallMPI(MPIU_Isend(sendbuf, sendsiz, MPIU_SCALAR, sendto, abt->tag, comm, &reqs[0]));
      PetscCallMPI(MPIU_Irecv(recvbuf, recvsiz, MPIU_SCALAR, recvfrom, abt->tag, comm, &reqs[1]));
    }

    /* local aseq * sendbuf^T */
    PetscCall(PetscBLASIntCast(ranges[recvisfrom + 1] - ranges[recvisfrom], &cn));
    if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "T", &cm, &cn, &ck, &_DOne, av, &alda, sendbuf, &cn, &_DZero, cv + clda * ranges[recvisfrom], &clda));

    if (nextrecvisfrom != rank) {
      /* wait for the sends and receives to complete, swap sendbuf and recvbuf */
      PetscCallMPI(MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE));
    }
    bn         = nextbn;
    recvisfrom = nextrecvisfrom;
    sendbuf    = recvbuf;
  }
  PetscCall(MatDenseRestoreArrayRead(a->A, &av));
  PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
  PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2368 
2369 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(Mat A, Mat B, Mat C)
2370 {
2371   Mat_MPIDense          *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
2372   Mat_MatTransMultDense *abt;
2373   MPI_Comm               comm;
2374   PetscMPIInt            size, ibn;
2375   PetscScalar           *cv, *sendbuf, *recvbuf;
2376   const PetscScalar     *av, *bv;
2377   PetscInt               blda, i, cK = A->cmap->N, k, j, bn;
2378   PetscScalar            _DOne = 1.0, _DZero = 0.0;
2379   PetscBLASInt           cm, cn, ck, alda, clda;
2380 
2381   PetscFunctionBegin;
2382   MatCheckProduct(C, 3);
2383   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2384   abt = (Mat_MatTransMultDense *)C->product->data;
2385   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2386   PetscCallMPI(MPI_Comm_size(comm, &size));
2387   PetscCall(MatDenseGetArrayRead(a->A, &av));
2388   PetscCall(MatDenseGetArrayRead(b->A, &bv));
2389   PetscCall(MatDenseGetArrayWrite(c->A, &cv));
2390   PetscCall(MatDenseGetLDA(a->A, &i));
2391   PetscCall(PetscBLASIntCast(i, &alda));
2392   PetscCall(MatDenseGetLDA(b->A, &blda));
2393   PetscCall(MatDenseGetLDA(c->A, &i));
2394   PetscCall(PetscBLASIntCast(i, &clda));
2395   /* copy transpose of B into buf[0] */
2396   bn      = B->rmap->n;
2397   sendbuf = abt->buf[0];
2398   recvbuf = abt->buf[1];
2399   for (k = 0, j = 0; j < bn; j++) {
2400     for (i = 0; i < cK; i++, k++) sendbuf[k] = bv[i * blda + j];
2401   }
2402   PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
2403   PetscCall(PetscMPIIntCast(bn * cK, &ibn));
2404   PetscCallMPI(MPI_Allgatherv(sendbuf, ibn, MPIU_SCALAR, recvbuf, abt->recvcounts, abt->recvdispls, MPIU_SCALAR, comm));
2405   PetscCall(PetscBLASIntCast(cK, &ck));
2406   PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
2407   PetscCall(PetscBLASIntCast(c->A->cmap->n, &cn));
2408   if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &cm, &cn, &ck, &_DOne, av, &alda, recvbuf, &ck, &_DZero, cv, &clda));
2409   PetscCall(MatDenseRestoreArrayRead(a->A, &av));
2410   PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
2411   PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
2412   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2413   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2414   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2415   PetscFunctionReturn(PETSC_SUCCESS);
2416 }
2417 
2418 static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
2419 {
2420   Mat_MatTransMultDense *abt;
2421 
2422   PetscFunctionBegin;
2423   MatCheckProduct(C, 3);
2424   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
2425   abt = (Mat_MatTransMultDense *)C->product->data;
2426   switch (abt->alg) {
2427   case 1:
2428     PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(A, B, C));
2429     break;
2430   default:
2431     PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(A, B, C));
2432     break;
2433   }
2434   PetscFunctionReturn(PETSC_SUCCESS);
2435 }
2436 
2437 static PetscErrorCode MatDestroy_MatMatMult_MPIDense_MPIDense(void *data)
2438 {
2439   Mat_MatMultDense *ab = (Mat_MatMultDense *)data;
2440 
2441   PetscFunctionBegin;
2442   PetscCall(MatDestroy(&ab->Ce));
2443   PetscCall(MatDestroy(&ab->Ae));
2444   PetscCall(MatDestroy(&ab->Be));
2445   PetscCall(PetscFree(ab));
2446   PetscFunctionReturn(PETSC_SUCCESS);
2447 }
2448 
/* Numeric phase of C = A * B for parallel dense matrices.  Either the
   Elemental path (when the symbolic phase created Ae/Ce mirrors) or the
   PETSc path: broadcast all of B to every rank into ab->Be, then compute the
   owned rows of C with a single sequential dense multiply. */
static PetscErrorCode MatMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
{
  Mat_MatMultDense *ab;
  Mat_MPIDense     *mdn = (Mat_MPIDense *)A->data;
  Mat_MPIDense     *b   = (Mat_MPIDense *)B->data;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Missing product data");
  ab = (Mat_MatMultDense *)C->product->data;
  if (ab->Ae && ab->Ce) {
#if PetscDefined(HAVE_ELEMENTAL)
    /* refresh the Elemental mirrors of A and B, multiply, convert back */
    PetscCall(MatConvert_MPIDense_Elemental(A, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Ae));
    PetscCall(MatConvert_MPIDense_Elemental(B, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Be));
    PetscCall(MatMatMultNumeric_Elemental(ab->Ae, ab->Be, ab->Ce));
    PetscCall(MatConvert(ab->Ce, MATMPIDENSE, MAT_REUSE_MATRIX, &C));
#else
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
#endif
  } else {
    MPI_Comm           comm;
    const PetscScalar *read;
    PetscScalar       *write;
    PetscInt           lda;
    const PetscInt    *ranges;
    PetscMPIInt        size;

    if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A)); /* cannot be done during the symbolic phase because of possible calls to MatProductReplaceMats() */
    comm = PetscObjectComm((PetscObject)B);
    PetscCallMPI(MPI_Comm_size(comm, &size));
    PetscCall(PetscLayoutGetRanges(B->rmap, &ranges));
    if (ranges[1] == ranges[size]) {
      // optimize for the case where the B matrix is broadcast from rank 0
      PetscInt           b_lda, be_lda;
      Mat                b_local  = b->A;
      Mat                b_alloc  = NULL;
      Mat                be_local = ab->Be;
      Mat                be_alloc = NULL;
      PetscMemType       b_memtype, be_memtype;
      const PetscScalar *b_array = NULL;
      MPI_Datatype       vector_type;
      PetscScalar       *be_array = NULL;
      PetscMPIInt        rank;

      PetscCallMPI(MPI_Comm_rank(comm, &rank));
      /* use a packed (lda == nrows) temporary for Be if its storage has gaps */
      PetscCall(MatDenseGetLDA(be_local, &be_lda));
      if (be_lda != B->rmap->N) {
        PetscCall(MatDuplicate(be_local, MAT_DO_NOT_COPY_VALUES, &be_alloc));
        be_local = be_alloc;
      }

      if (rank == 0) {
        /* rank 0 owns all of B; pack it too when its lda has gaps */
        PetscCall(MatDenseGetLDA(b_local, &b_lda));
        if (b_lda != B->rmap->N) {
          PetscCall(MatDuplicate(b_local, MAT_DO_NOT_COPY_VALUES, &b_alloc));
          PetscCall(MatCopy(b_local, b_alloc, DIFFERENT_NONZERO_PATTERN));
          b_local = b_alloc;
        }
      }
      /* one MPI element per matrix row: B->cmap->N contiguous scalars */
      PetscCallMPI(MPI_Type_contiguous(B->cmap->N, MPIU_SCALAR, &vector_type));
      PetscCallMPI(MPI_Type_commit(&vector_type));
      PetscCall(MatDenseGetArrayReadAndMemType(b_local, &b_array, &b_memtype));
      PetscCall(MatDenseGetArrayWriteAndMemType(be_local, &be_array, &be_memtype));
      PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, vector_type, b_memtype, b_array, be_memtype, be_array, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(mdn->Mvctx, vector_type, b_array, be_array, MPI_REPLACE));
      PetscCall(MatDenseRestoreArrayWriteAndMemType(be_local, &be_array));
      PetscCall(MatDenseRestoreArrayReadAndMemType(b_local, &b_array));
      /* if Be was received into a packed temporary, copy it back */
      if (be_local != ab->Be) PetscCall(MatCopy(be_local, ab->Be, DIFFERENT_NONZERO_PATTERN));
      PetscCallMPI(MPI_Type_free(&vector_type));
      PetscCall(MatDestroy(&be_alloc));
      PetscCall(MatDestroy(&b_alloc));
    } else {
      /* general case: broadcast B column by column through the SF */
      PetscCall(MatDenseGetLDA(B, &lda));
      PetscCall(MatDenseGetArrayRead(B, &read));
      PetscCall(MatDenseGetArrayWrite(ab->Be, &write));
      for (PetscInt i = 0; i < C->cmap->N; ++i) {
        PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
      }
      PetscCall(MatDenseRestoreArrayWrite(ab->Be, &write));
      PetscCall(MatDenseRestoreArrayRead(B, &read));
    }
    /* local multiply: C_local = A_local * Be (Be now holds all of B) */
    PetscCall(MatMatMultNumeric_SeqDense_SeqDense(((Mat_MPIDense *)A->data)->A, ab->Be, ((Mat_MPIDense *)C->data)->A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2535 
2536 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
2537 {
2538   Mat_Product      *product = C->product;
2539   PetscInt          alg;
2540   Mat_MatMultDense *ab;
2541   PetscBool         flg;
2542 
2543   PetscFunctionBegin;
2544   MatCheckProduct(C, 4);
2545   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
2546   /* check local size of A and B */
2547   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ", %" PetscInt_FMT ")",
2548              A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
2549 
2550   PetscCall(PetscStrcmp(product->alg, "petsc", &flg));
2551   alg = flg ? 0 : 1;
2552 
2553   /* setup C */
2554   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
2555   PetscCall(MatSetType(C, MATMPIDENSE));
2556   PetscCall(MatSetUp(C));
2557 
2558   /* create data structure for reuse Cdense */
2559   PetscCall(PetscNew(&ab));
2560 
2561   switch (alg) {
2562   case 1: /* alg: "elemental" */
2563 #if PetscDefined(HAVE_ELEMENTAL)
2564     /* create elemental matrices Ae and Be */
2565     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &ab->Ae));
2566     PetscCall(MatSetSizes(ab->Ae, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
2567     PetscCall(MatSetType(ab->Ae, MATELEMENTAL));
2568     PetscCall(MatSetUp(ab->Ae));
2569     PetscCall(MatSetOption(ab->Ae, MAT_ROW_ORIENTED, PETSC_FALSE));
2570 
2571     PetscCall(MatCreate(PetscObjectComm((PetscObject)B), &ab->Be));
2572     PetscCall(MatSetSizes(ab->Be, PETSC_DECIDE, PETSC_DECIDE, B->rmap->N, B->cmap->N));
2573     PetscCall(MatSetType(ab->Be, MATELEMENTAL));
2574     PetscCall(MatSetUp(ab->Be));
2575     PetscCall(MatSetOption(ab->Be, MAT_ROW_ORIENTED, PETSC_FALSE));
2576 
2577     /* compute symbolic Ce = Ae*Be */
2578     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &ab->Ce));
2579     PetscCall(MatMatMultSymbolic_Elemental(ab->Ae, ab->Be, fill, ab->Ce));
2580 #else
2581     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
2582 #endif
2583     break;
2584   default: /* alg: "petsc" */
2585     ab->Ae = NULL;
2586     PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, A->cmap->N, B->cmap->N, NULL, &ab->Be));
2587     ab->Ce = NULL;
2588     break;
2589   }
2590 
2591   C->product->data       = ab;
2592   C->product->destroy    = MatDestroy_MatMatMult_MPIDense_MPIDense;
2593   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIDense;
2594   PetscFunctionReturn(PETSC_SUCCESS);
2595 }
2596 
/* Configure a C = A*B product on MPIDENSE matrices: pick the algorithm
   ("petsc" default, "elemental" only when PETSc is built with Elemental)
   from the product options and install the symbolic callbacks. */
static PetscErrorCode MatProductSetFromOptions_MPIDense_AB(Mat C)
{
  Mat_Product *product     = C->product;
  const char  *algTypes[2] = {"petsc", "elemental"};
  /* only expose "elemental" in the option list when it is available */
  PetscInt     alg, nalg = PetscDefined(HAVE_ELEMENTAL) ? 2 : 1;
  PetscBool    flg = PETSC_FALSE;

  PetscFunctionBegin;
  /* Set default algorithm */
  alg = 0; /* default is PETSc */
  PetscCall(PetscStrcmp(product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  /* Get runtime option */
  PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
  PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic = MatProductSymbolic_AB;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2620 
/* Configure a C = A^T*B product on MPIDENSE matrices: A and B must share the
   same row ownership on every rank; install the symbolic callbacks. */
static PetscErrorCode MatProductSetFromOptions_MPIDense_AtB(Mat C)
{
  Mat_Product *product = C->product;
  Mat          A = product->A, B = product->B;

  PetscFunctionBegin;
  /* A^T*B sums over rows, so A's and B's row layouts must coincide */
  PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
  C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic          = MatProductSymbolic_AtB;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2633 
/* Configure a C = A*B^T product on MPIDENSE matrices: pick the algorithm
   ("allgatherv" default or "cyclic") from the product options -- honoring
   the legacy option name when invoked through the old MatMatTransposeMult
   API -- and install the symbolic callbacks. */
static PetscErrorCode MatProductSetFromOptions_MPIDense_ABt(Mat C)
{
  Mat_Product *product     = C->product;
  const char  *algTypes[2] = {"allgatherv", "cyclic"};
  PetscInt     alg, nalg = 2;
  PetscBool    flg = PETSC_FALSE;

  PetscFunctionBegin;
  /* Set default algorithm */
  alg = 0; /* default is allgatherv */
  PetscCall(PetscStrcmp(product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  /* Get runtime option */
  if (product->api_user) {
    /* legacy MatMatTransposeMult() entry point: old-style option name */
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
    PetscCall(PetscOptionsEList("-matmattransmult_mpidense_mpidense_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic          = MatProductSymbolic_ABt;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2663 
2664 static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat C)
2665 {
2666   Mat_Product *product = C->product;
2667 
2668   PetscFunctionBegin;
2669   switch (product->type) {
2670   case MATPRODUCT_AB:
2671     PetscCall(MatProductSetFromOptions_MPIDense_AB(C));
2672     break;
2673   case MATPRODUCT_AtB:
2674     PetscCall(MatProductSetFromOptions_MPIDense_AtB(C));
2675     break;
2676   case MATPRODUCT_ABt:
2677     PetscCall(MatProductSetFromOptions_MPIDense_ABt(C));
2678     break;
2679   default:
2680     break;
2681   }
2682   PetscFunctionReturn(PETSC_SUCCESS);
2683 }
2684