xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 4e8208cbcbc709572b8abe32f33c78b69c819375)
1d50806bdSBarry Smith /*
22499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3d50806bdSBarry Smith           C = A * B
4d50806bdSBarry Smith */
5d50806bdSBarry Smith 
6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
8c6db04a5SJed Brown #include <petscbt.h>
9af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1007475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
117bab7c10SHong Zhang 
MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13d71ae5a4SJacob Faibussowitsch {
14df97dc6dSFande Kong   PetscFunctionBegin;
15dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18df97dc6dSFande Kong }
19df97dc6dSFande Kong 
204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)21d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22d71ae5a4SJacob Faibussowitsch {
234222ddf1SHong Zhang   PetscInt    ii;
244222ddf1SHong Zhang   Mat_SeqAIJ *aij;
259f0612e4SBarry Smith   PetscBool   isseqaij, ofree_a, ofree_ij;
265c66b693SKris Buschelman 
275c66b693SKris Buschelman   PetscFunctionBegin;
28aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
299566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
304222ddf1SHong Zhang 
31e4e71118SStefano Zampini   if (!mtype) {
329566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
339566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34e4e71118SStefano Zampini   } else {
359566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
36e4e71118SStefano Zampini   }
37cbc6b225SStefano Zampini 
3857508eceSPierre Jolivet   aij      = (Mat_SeqAIJ *)mat->data;
39cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
40cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
41cbc6b225SStefano Zampini   /* changes the free flags */
429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43cbc6b225SStefano Zampini 
449566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
459566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
48cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
50cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
51cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
52cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53cbc6b225SStefano Zampini   }
54cbc6b225SStefano Zampini   aij->maxnz = i[m];
55cbc6b225SStefano Zampini   aij->nz    = i[m];
564222ddf1SHong Zhang 
579f0612e4SBarry Smith   if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
589f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
599f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
609f0612e4SBarry Smith 
614222ddf1SHong Zhang   aij->i       = i;
624222ddf1SHong Zhang   aij->j       = j;
634222ddf1SHong Zhang   aij->a       = a;
644222ddf1SHong Zhang   aij->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
654222ddf1SHong Zhang   aij->free_a  = PETSC_FALSE;
664222ddf1SHong Zhang   aij->free_ij = PETSC_FALSE;
679566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
686dc08606SJunchao Zhang   // Always build the diag info when i, j are set
693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
705c66b693SKris Buschelman }
711c24bd37SHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)72d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
73d71ae5a4SJacob Faibussowitsch {
744222ddf1SHong Zhang   Mat_Product        *product = C->product;
754222ddf1SHong Zhang   MatProductAlgorithm alg;
764222ddf1SHong Zhang   PetscBool           flg;
774222ddf1SHong Zhang 
784222ddf1SHong Zhang   PetscFunctionBegin;
794222ddf1SHong Zhang   if (product) {
804222ddf1SHong Zhang     alg = product->alg;
814222ddf1SHong Zhang   } else {
824222ddf1SHong Zhang     alg = "sorted";
834222ddf1SHong Zhang   }
844222ddf1SHong Zhang   /* sorted */
859566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
864222ddf1SHong Zhang   if (flg) {
879566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
883ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
894222ddf1SHong Zhang   }
904222ddf1SHong Zhang 
914222ddf1SHong Zhang   /* scalable */
929566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
934222ddf1SHong Zhang   if (flg) {
949566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
953ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
964222ddf1SHong Zhang   }
974222ddf1SHong Zhang 
984222ddf1SHong Zhang   /* scalable_fast */
999566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1004222ddf1SHong Zhang   if (flg) {
1019566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1023ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1034222ddf1SHong Zhang   }
1044222ddf1SHong Zhang 
1054222ddf1SHong Zhang   /* heap */
1069566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1074222ddf1SHong Zhang   if (flg) {
1089566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1093ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1104222ddf1SHong Zhang   }
1114222ddf1SHong Zhang 
1124222ddf1SHong Zhang   /* btheap */
1139566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1144222ddf1SHong Zhang   if (flg) {
1159566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1163ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1174222ddf1SHong Zhang   }
1184222ddf1SHong Zhang 
1194222ddf1SHong Zhang   /* llcondensed */
1209566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1214222ddf1SHong Zhang   if (flg) {
1229566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1233ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1244222ddf1SHong Zhang   }
1254222ddf1SHong Zhang 
1264222ddf1SHong Zhang   /* rowmerge */
1279566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1284222ddf1SHong Zhang   if (flg) {
1299566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1303ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1314222ddf1SHong Zhang   }
1324222ddf1SHong Zhang 
1334222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1349566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1354222ddf1SHong Zhang   if (flg) {
1369566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1373ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1384222ddf1SHong Zhang   }
1394222ddf1SHong Zhang #endif
1404222ddf1SHong Zhang 
1414222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1424222ddf1SHong Zhang }
1434222ddf1SHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)144d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
145d71ae5a4SJacob Faibussowitsch {
146b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
147971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
148c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
149b561aa9dSHong Zhang   PetscReal          afill;
150eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
151eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
152fb908031SHong Zhang   PetscBT            lnkbt;
1530298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
154b561aa9dSHong Zhang 
155b561aa9dSHong Zhang   PetscFunctionBegin;
156bd958071SHong Zhang   /* Get ci and cj */
157fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
158fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
160fb908031SHong Zhang   ci[0] = 0;
161fb908031SHong Zhang 
162fb908031SHong Zhang   /* create and initialize a linked list */
163eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
164c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
165eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
166eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
167eca6b86aSHong Zhang 
1689566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
169fb908031SHong Zhang 
170fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1719566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1722205254eSKarl Rupp 
173fb908031SHong Zhang   current_space = free_space;
174fb908031SHong Zhang 
175fb908031SHong Zhang   /* Determine ci and cj */
176fb908031SHong Zhang   for (i = 0; i < am; i++) {
177fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
178fb908031SHong Zhang     aj   = a->j + ai[i];
179fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
180fb908031SHong Zhang       brow = aj[j];
181fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
182fb908031SHong Zhang       bj   = b->j + bi[brow];
183fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1849566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
185fb908031SHong Zhang     }
1868c78258cSHong Zhang     /* add possible missing diagonal entry */
18748a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
188fb908031SHong Zhang     cnzi = lnk[0];
189fb908031SHong Zhang 
190fb908031SHong Zhang     /* If free space is not available, make more free space */
191fb908031SHong Zhang     /* Double the amount of total space in the list */
192fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
1939566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
194fb908031SHong Zhang       ndouble++;
195fb908031SHong Zhang     }
196fb908031SHong Zhang 
197fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
1989566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
1992205254eSKarl Rupp 
200fb908031SHong Zhang     current_space->array += cnzi;
201fb908031SHong Zhang     current_space->local_used += cnzi;
202fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2032205254eSKarl Rupp 
204fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
205fb908031SHong Zhang   }
206fb908031SHong Zhang 
207fb908031SHong Zhang   /* Column indices are in the list of free space */
208fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
209fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2119566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2129566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
213b561aa9dSHong Zhang 
21426be0446SHong Zhang   /* put together the new symbolic matrix */
2159566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2169566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
21758c24d83SHong Zhang 
21858c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
21958c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
220f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
221aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
222e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22358c24d83SHong Zhang   c->nonew   = 0;
2244222ddf1SHong Zhang 
2254222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2264222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2270b7e3e3dSHong Zhang 
2287212da7cSHong Zhang   /* set MatInfo */
2297212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
230f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2314222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2324222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2334222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2347212da7cSHong Zhang 
2357212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2367212da7cSHong Zhang   if (ci[am]) {
2379566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
239f2b054eeSHong Zhang   } else {
2409566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
241be0fcf8dSHong Zhang   }
242f2b054eeSHong Zhang #endif
2433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
24458c24d83SHong Zhang }
245d50806bdSBarry Smith 
MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)246d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
247d71ae5a4SJacob Faibussowitsch {
248d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
249d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
250d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
251d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25238baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
253c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
254519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2552e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
256aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2576718818eSStefano Zampini   PetscContainer     cab_dense;
2582e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
259d50806bdSBarry Smith 
260d50806bdSBarry Smith   PetscFunctionBegin;
2619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
263aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2649566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
265aa1aec99SHong Zhang     c->a      = ca;
266aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2676718818eSStefano Zampini   } else ca = c->a;
2686718818eSStefano Zampini 
2696718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2706718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2716718818eSStefano Zampini      that is hard to eradicate) */
2729566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2736718818eSStefano Zampini   if (!cab_dense) {
2749566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
27549abdd8aSBarry Smith     PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault));
276*2a8381b2SBarry Smith   } else PetscCall(PetscContainerGetPointer(cab_dense, &ab_dense));
2779566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
278aa1aec99SHong Zhang 
279c124e916SHong Zhang   /* clean old values in C */
2809566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
281d50806bdSBarry Smith   /* Traverse A row-wise. */
282d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
283d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
284d50806bdSBarry Smith   for (i = 0; i < am; i++) {
285d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
286d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
287519eb980SHong Zhang       brow = aj[j];
288d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
2893b51c7ffSStefano Zampini       bjj  = PetscSafePointerPlusOffset(bj, bi[brow]);
2903b51c7ffSStefano Zampini       baj  = PetscSafePointerPlusOffset(ba, bi[brow]);
291519eb980SHong Zhang       /* perform dense axpy */
29236ec6d2dSHong Zhang       valtmp = aa[j];
293ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
294519eb980SHong Zhang       flops += 2 * bnzi;
295519eb980SHong Zhang     }
2968e3a54c0SPierre Jolivet     aj = PetscSafePointerPlusOffset(aj, anzi);
2978e3a54c0SPierre Jolivet     aa = PetscSafePointerPlusOffset(aa, anzi);
298c58ca2e3SHong Zhang 
299c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
300519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
301519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
302519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
303519eb980SHong Zhang     }
304519eb980SHong Zhang     flops += cnzi;
3058e3a54c0SPierre Jolivet     cj = PetscSafePointerPlusOffset(cj, cnzi);
3069371c9d4SSatish Balay     ca += cnzi;
307519eb980SHong Zhang   }
3082e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3092e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3102e5835c6SStefano Zampini #endif
3119566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3129566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3139566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
317c58ca2e3SHong Zhang }
318c58ca2e3SHong Zhang 
MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)319d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
320d71ae5a4SJacob Faibussowitsch {
321c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
322c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
323c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
324c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
325c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
326c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
327c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3282e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3292e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
330c58ca2e3SHong Zhang   PetscInt           nextb;
331c58ca2e3SHong Zhang 
332c58ca2e3SHong Zhang   PetscFunctionBegin;
3339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
335cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3369566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
337cf742fccSHong Zhang     c->a      = ca;
338cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
339cf742fccSHong Zhang   }
340cf742fccSHong Zhang 
341c58ca2e3SHong Zhang   /* clean old values in C */
3429566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
343c58ca2e3SHong Zhang   /* Traverse A row-wise. */
344c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
345c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
346519eb980SHong Zhang   for (i = 0; i < am; i++) {
347519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
348519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
349519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
350519eb980SHong Zhang       brow = aj[j];
351519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
352519eb980SHong Zhang       bjj  = bj + bi[brow];
353519eb980SHong Zhang       baj  = ba + bi[brow];
354519eb980SHong Zhang       /* perform sparse axpy */
35536ec6d2dSHong Zhang       valtmp = aa[j];
356c124e916SHong Zhang       nextb  = 0;
357c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
358c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
35936ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
360c124e916SHong Zhang         }
361d50806bdSBarry Smith       }
362d50806bdSBarry Smith       flops += 2 * bnzi;
363d50806bdSBarry Smith     }
3649371c9d4SSatish Balay     aj += anzi;
3659371c9d4SSatish Balay     aa += anzi;
3669371c9d4SSatish Balay     cj += cnzi;
3679371c9d4SSatish Balay     ca += cnzi;
368d50806bdSBarry Smith   }
3692e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3702e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3712e5835c6SStefano Zampini #endif
3729566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3739566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
378d50806bdSBarry Smith }
379bc011b1eSHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)380d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
381d71ae5a4SJacob Faibussowitsch {
38225296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
38325296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3843c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
38525296bd5SBarry Smith   MatScalar         *ca;
38625296bd5SBarry Smith   PetscReal          afill;
387eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
388eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
3890298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
39025296bd5SBarry Smith 
39125296bd5SBarry Smith   PetscFunctionBegin;
3923c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3933c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
3949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
3953c50cad2SHong Zhang   ci[0] = 0;
3963c50cad2SHong Zhang 
3973c50cad2SHong Zhang   /* create and initialize a linked list */
398eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
399c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
400eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
401eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
402eca6b86aSHong Zhang 
4039566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4043c50cad2SHong Zhang 
4053c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4069566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4073c50cad2SHong Zhang   current_space = free_space;
4083c50cad2SHong Zhang 
4093c50cad2SHong Zhang   /* Determine ci and cj */
4103c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4113c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4123c50cad2SHong Zhang     aj   = a->j + ai[i];
4133c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4143c50cad2SHong Zhang       brow = aj[j];
4153c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4163c50cad2SHong Zhang       bj   = b->j + bi[brow];
4173c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4189566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4193c50cad2SHong Zhang     }
4208c78258cSHong Zhang     /* add possible missing diagonal entry */
42148a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4223c50cad2SHong Zhang     cnzi = lnk[1];
4233c50cad2SHong Zhang 
4243c50cad2SHong Zhang     /* If free space is not available, make more free space */
4253c50cad2SHong Zhang     /* Double the amount of total space in the list */
4263c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4279566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4283c50cad2SHong Zhang       ndouble++;
4293c50cad2SHong Zhang     }
4303c50cad2SHong Zhang 
4313c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4329566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4332205254eSKarl Rupp 
4343c50cad2SHong Zhang     current_space->array += cnzi;
4353c50cad2SHong Zhang     current_space->local_used += cnzi;
4363c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4372205254eSKarl Rupp 
4383c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4393c50cad2SHong Zhang   }
4403c50cad2SHong Zhang 
4413c50cad2SHong Zhang   /* Column indices are in the list of free space */
4423c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4433c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4459566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
44725296bd5SBarry Smith 
44825296bd5SBarry Smith   /* Allocate space for ca */
4499566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
45025296bd5SBarry Smith 
45125296bd5SBarry Smith   /* put together the new symbolic matrix */
4529566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4539566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
45425296bd5SBarry Smith 
45525296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
45625296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
457f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
45825296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
45925296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46025296bd5SBarry Smith   c->nonew   = 0;
4612205254eSKarl Rupp 
4624222ddf1SHong Zhang   /* slower, less memory */
4634222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
46425296bd5SBarry Smith 
46525296bd5SBarry Smith   /* set MatInfo */
46625296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
46725296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4684222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4694222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4704222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
47125296bd5SBarry Smith 
47225296bd5SBarry Smith #if defined(PETSC_USE_INFO)
47325296bd5SBarry Smith   if (ci[am]) {
4749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4759566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
47625296bd5SBarry Smith   } else {
4779566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
47825296bd5SBarry Smith   }
47925296bd5SBarry Smith #endif
4803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
48125296bd5SBarry Smith }
48225296bd5SBarry Smith 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)483d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
484d71ae5a4SJacob Faibussowitsch {
485e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
486bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
48725c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
488e9e4536cSHong Zhang   MatScalar         *ca;
489e9e4536cSHong Zhang   PetscReal          afill;
490eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
491eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4920298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
493e9e4536cSHong Zhang 
494e9e4536cSHong Zhang   PetscFunctionBegin;
495bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
496bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
498bd958071SHong Zhang   ci[0] = 0;
499bd958071SHong Zhang 
500bd958071SHong Zhang   /* create and initialize a linked list */
501eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
502c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
503eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
504eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5059566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
506bd958071SHong Zhang 
507bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5089566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
509bd958071SHong Zhang   current_space = free_space;
510bd958071SHong Zhang 
511bd958071SHong Zhang   /* Determine ci and cj */
512bd958071SHong Zhang   for (i = 0; i < am; i++) {
513bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
514bd958071SHong Zhang     aj   = a->j + ai[i];
515bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
516bd958071SHong Zhang       brow = aj[j];
517bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
518bd958071SHong Zhang       bj   = b->j + bi[brow];
519bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5209566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
521bd958071SHong Zhang     }
5228c78258cSHong Zhang     /* add possible missing diagonal entry */
52348a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5248c78258cSHong Zhang 
525bd958071SHong Zhang     cnzi = lnk[0];
526bd958071SHong Zhang 
527bd958071SHong Zhang     /* If free space is not available, make more free space */
528bd958071SHong Zhang     /* Double the amount of total space in the list */
529bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5309566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
531bd958071SHong Zhang       ndouble++;
532bd958071SHong Zhang     }
533bd958071SHong Zhang 
534bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5359566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5362205254eSKarl Rupp 
537bd958071SHong Zhang     current_space->array += cnzi;
538bd958071SHong Zhang     current_space->local_used += cnzi;
539bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5402205254eSKarl Rupp 
541bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
542bd958071SHong Zhang   }
543bd958071SHong Zhang 
544bd958071SHong Zhang   /* Column indices are in the list of free space */
545bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
546bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5489566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5499566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
550e9e4536cSHong Zhang 
551e9e4536cSHong Zhang   /* Allocate space for ca */
5529566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
553e9e4536cSHong Zhang 
554e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5559566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5569566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
557e9e4536cSHong Zhang 
558e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
559e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
560f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
561e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
562e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
563e9e4536cSHong Zhang   c->nonew   = 0;
5642205254eSKarl Rupp 
5654222ddf1SHong Zhang   /* slower, less memory */
5664222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
567e9e4536cSHong Zhang 
568e9e4536cSHong Zhang   /* set MatInfo */
569e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
570e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5714222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5724222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5734222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
574e9e4536cSHong Zhang 
575e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
576e9e4536cSHong Zhang   if (ci[am]) {
5779566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5789566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
579e9e4536cSHong Zhang   } else {
5809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
581e9e4536cSHong Zhang   }
582e9e4536cSHong Zhang #endif
5833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
584e9e4536cSHong Zhang }
585e9e4536cSHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)586d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
587d71ae5a4SJacob Faibussowitsch {
5880ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
5890ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
5900ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
5910ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
5920ced3a2bSJed Brown   PetscReal          afill;
5930ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
5940298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
5950ced3a2bSJed Brown   PetscHeap          h;
5960ced3a2bSJed Brown 
5970ced3a2bSJed Brown   PetscFunctionBegin;
598cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
5990ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6010ced3a2bSJed Brown   ci[0] = 0;
6020ced3a2bSJed Brown 
6030ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6049566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6050ced3a2bSJed Brown   current_space = free_space;
6060ced3a2bSJed Brown 
6079566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6090ced3a2bSJed Brown 
6100ced3a2bSJed Brown   /* Determine ci and cj */
6110ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6120ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6130ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6140ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6150ced3a2bSJed Brown     /* Populate the min heap */
6160ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6170ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6180ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6199566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6200ced3a2bSJed Brown       }
6210ced3a2bSJed Brown     }
6220ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6239566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6240ced3a2bSJed Brown     while (j >= 0) {
6250ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6269566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6270ced3a2bSJed Brown         ndouble++;
6280ced3a2bSJed Brown       }
6290ced3a2bSJed Brown       *(current_space->array++) = col;
6300ced3a2bSJed Brown       current_space->local_used++;
6310ced3a2bSJed Brown       current_space->local_remaining--;
6320ced3a2bSJed Brown       ci[i + 1]++;
6330ced3a2bSJed Brown 
6340ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6359566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6360ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6370ced3a2bSJed Brown         PetscInt j2, col2;
6389566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6390ced3a2bSJed Brown         if (col2 != col) break;
6409566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6419566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6420ced3a2bSJed Brown       }
6430ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6449566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6459566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6460ced3a2bSJed Brown     }
6470ced3a2bSJed Brown   }
6489566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6499566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6500ced3a2bSJed Brown 
6510ced3a2bSJed Brown   /* Column indices are in the list of free space */
6520ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6530ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6559566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6560ced3a2bSJed Brown 
6570ced3a2bSJed Brown   /* put together the new symbolic matrix */
6589566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6599566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6600ced3a2bSJed Brown 
6610ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6620ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
663f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
6640ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6650ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6660ced3a2bSJed Brown   c->nonew   = 0;
66726fbe8dcSKarl Rupp 
6684222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6690ced3a2bSJed Brown 
6700ced3a2bSJed Brown   /* set MatInfo */
6710ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6720ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6734222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6744222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6754222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6760ced3a2bSJed Brown 
6770ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6780ced3a2bSJed Brown   if (ci[am]) {
6799566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6810ced3a2bSJed Brown   } else {
6829566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6830ced3a2bSJed Brown   }
6840ced3a2bSJed Brown #endif
6853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6860ced3a2bSJed Brown }
687e9e4536cSHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)688d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
689d71ae5a4SJacob Faibussowitsch {
6908a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6918a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6928a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
6938a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
6948a07c6f1SJed Brown   PetscReal          afill;
6958a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
6960298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6978a07c6f1SJed Brown   PetscHeap          h;
6988a07c6f1SJed Brown   PetscBT            bt;
6998a07c6f1SJed Brown 
7008a07c6f1SJed Brown   PetscFunctionBegin;
701cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7028a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7048a07c6f1SJed Brown   ci[0] = 0;
7058a07c6f1SJed Brown 
7068a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7079566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7082205254eSKarl Rupp 
7098a07c6f1SJed Brown   current_space = free_space;
7108a07c6f1SJed Brown 
7119566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7139566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7148a07c6f1SJed Brown 
7158a07c6f1SJed Brown   /* Determine ci and cj */
7168a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7178a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7188a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7198a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7208a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7218a07c6f1SJed Brown     /* Populate the min heap */
7228a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7238a07c6f1SJed Brown       PetscInt brow = acol[j];
7248a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7258a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7268a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7279566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7288a07c6f1SJed Brown           bb[j]++;
7298a07c6f1SJed Brown           break;
7308a07c6f1SJed Brown         }
7318a07c6f1SJed Brown       }
7328a07c6f1SJed Brown     }
7338a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7349566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7358a07c6f1SJed Brown     while (j >= 0) {
7368a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7370298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7389566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7398a07c6f1SJed Brown         ndouble++;
7408a07c6f1SJed Brown       }
7418a07c6f1SJed Brown       *(current_space->array++) = col;
7428a07c6f1SJed Brown       current_space->local_used++;
7438a07c6f1SJed Brown       current_space->local_remaining--;
7448a07c6f1SJed Brown       ci[i + 1]++;
7458a07c6f1SJed Brown 
7468a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7478a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7488a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7498a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7509566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7518a07c6f1SJed Brown           bb[j]++;
7528a07c6f1SJed Brown           break;
7538a07c6f1SJed Brown         }
7548a07c6f1SJed Brown       }
7559566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7568a07c6f1SJed Brown     }
7578a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7589566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7598a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7609566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7618a07c6f1SJed Brown     }
7628a07c6f1SJed Brown   }
7639566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7649566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7659566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7668a07c6f1SJed Brown 
7678a07c6f1SJed Brown   /* Column indices are in the list of free space */
7688a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7698a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7719566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7728a07c6f1SJed Brown 
7738a07c6f1SJed Brown   /* put together the new symbolic matrix */
7749566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7759566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7768a07c6f1SJed Brown 
7778a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7788a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
779f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
7808a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7818a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7828a07c6f1SJed Brown   c->nonew   = 0;
78326fbe8dcSKarl Rupp 
7844222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7858a07c6f1SJed Brown 
7868a07c6f1SJed Brown   /* set MatInfo */
7878a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
7888a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7894222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7904222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7914222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7928a07c6f1SJed Brown 
7938a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
7948a07c6f1SJed Brown   if (ci[am]) {
7959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
7969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
7978a07c6f1SJed Brown   } else {
7989566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
7998a07c6f1SJed Brown   }
8008a07c6f1SJed Brown #endif
8013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8028a07c6f1SJed Brown }
8038a07c6f1SJed Brown 
MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)804d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
805d71ae5a4SJacob Faibussowitsch {
806d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
807d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
808d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
809d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
810d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
811d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
812d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
813d7ed1a4dSandi selinger   PetscInt        window[8];
814d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
815d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
816d7ed1a4dSandi selinger   PetscReal       afill;
817f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8187660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
819d7ed1a4dSandi selinger 
820d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
821d7ed1a4dSandi selinger              Because of the way virtual memory works,
822d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
823d7ed1a4dSandi selinger   PetscFunctionBegin;
8249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
825d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
826d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
827d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
828d7ed1a4dSandi selinger     a_rownnz             = 0;
829d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
830d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
831d7ed1a4dSandi selinger       if (a_rownnz > bn) {
832d7ed1a4dSandi selinger         a_rownnz = bn;
833d7ed1a4dSandi selinger         break;
834d7ed1a4dSandi selinger       }
835d7ed1a4dSandi selinger     }
836d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
837d7ed1a4dSandi selinger   }
838d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8399566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
842d7ed1a4dSandi selinger 
8437660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8447660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
845d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
847d7ed1a4dSandi selinger 
848d7ed1a4dSandi selinger   ci_nnz      = 0;
849d7ed1a4dSandi selinger   ci[0]       = 0;
850d7ed1a4dSandi selinger   worki_L1[0] = 0;
851d7ed1a4dSandi selinger   worki_L2[0] = 0;
852d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
853d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
854d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
855d7ed1a4dSandi selinger     rowsleft             = anzi;
856d7ed1a4dSandi selinger     inputcol_L1          = acol;
857d7ed1a4dSandi selinger     L2_nnz               = 0;
8587660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8597660c4dbSandi selinger     worki_L2[1]          = 0;
86008fe1b3cSKarl Rupp     outputi_nnz          = 0;
861d7ed1a4dSandi selinger 
862d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
863d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
864d7ed1a4dSandi selinger       c_maxmem *= 2;
865d7ed1a4dSandi selinger       ndouble++;
8669566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
867d7ed1a4dSandi selinger     }
868d7ed1a4dSandi selinger 
869d7ed1a4dSandi selinger     while (rowsleft) {
870d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
871d7ed1a4dSandi selinger       L1_nrows    = 0;
8727660c4dbSandi selinger       L1_nnz      = 0;
873d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
874d7ed1a4dSandi selinger       inputi      = bi;
875d7ed1a4dSandi selinger       inputj      = bj;
876d7ed1a4dSandi selinger 
877d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
878d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
879f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
880d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
881d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
882a8f51744SPierre Jolivet   do { \
883d7ed1a4dSandi selinger     window_min  = bn; \
8847660c4dbSandi selinger     outputi_nnz = 0; \
8857660c4dbSandi selinger     for (k = 0; k < ANNZ; ++k) { \
886d7ed1a4dSandi selinger       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
887d7ed1a4dSandi selinger       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
888d7ed1a4dSandi selinger       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
889d7ed1a4dSandi selinger       window_min  = PetscMin(window[k], window_min); \
890d7ed1a4dSandi selinger     } \
891d7ed1a4dSandi selinger     while (window_min < bn) { \
892d7ed1a4dSandi selinger       outputj[outputi_nnz++] = window_min; \
893d7ed1a4dSandi selinger       /* advance front and compute new minimum */ \
894d7ed1a4dSandi selinger       old_window_min = window_min; \
895d7ed1a4dSandi selinger       window_min     = bn; \
896d7ed1a4dSandi selinger       for (k = 0; k < ANNZ; ++k) { \
897d7ed1a4dSandi selinger         if (window[k] == old_window_min) { \
898d7ed1a4dSandi selinger           brow_ptr[k]++; \
899d7ed1a4dSandi selinger           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900d7ed1a4dSandi selinger         } \
901d7ed1a4dSandi selinger         window_min = PetscMin(window[k], window_min); \
902d7ed1a4dSandi selinger       } \
903a8f51744SPierre Jolivet     } \
904a8f51744SPierre Jolivet   } while (0)
905d7ed1a4dSandi selinger 
906d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
907d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
908d7ed1a4dSandi selinger       while (L1_rowsleft) {
9097660c4dbSandi selinger         outputi_nnz = 0;
9107660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9117660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9127660c4dbSandi selinger 
913d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9149371c9d4SSatish Balay         case 1:
9159371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
916d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
917d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
918d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
919d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
920d7ed1a4dSandi selinger           L1_rowsleft = 0;
921d7ed1a4dSandi selinger           break;
9229371c9d4SSatish Balay         case 2:
9239371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
924d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
925d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
926d7ed1a4dSandi selinger           L1_rowsleft = 0;
927d7ed1a4dSandi selinger           break;
9289371c9d4SSatish Balay         case 3:
9299371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
930d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
931d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
932d7ed1a4dSandi selinger           L1_rowsleft = 0;
933d7ed1a4dSandi selinger           break;
9349371c9d4SSatish Balay         case 4:
9359371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
936d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
937d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
938d7ed1a4dSandi selinger           L1_rowsleft = 0;
939d7ed1a4dSandi selinger           break;
9409371c9d4SSatish Balay         case 5:
9419371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
942d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
943d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
944d7ed1a4dSandi selinger           L1_rowsleft = 0;
945d7ed1a4dSandi selinger           break;
9469371c9d4SSatish Balay         case 6:
9479371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
948d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
949d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
950d7ed1a4dSandi selinger           L1_rowsleft = 0;
951d7ed1a4dSandi selinger           break;
9529371c9d4SSatish Balay         case 7:
9539371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
954d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
955d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
956d7ed1a4dSandi selinger           L1_rowsleft = 0;
957d7ed1a4dSandi selinger           break;
9589371c9d4SSatish Balay         default:
9599371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
960d7ed1a4dSandi selinger           inputcol += 8;
961d7ed1a4dSandi selinger           rowsleft -= 8;
962d7ed1a4dSandi selinger           L1_rowsleft -= 8;
963d7ed1a4dSandi selinger           break;
964d7ed1a4dSandi selinger         }
965d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9667660c4dbSandi selinger         L1_nnz += outputi_nnz;
9677660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
968d7ed1a4dSandi selinger       }
969d7ed1a4dSandi selinger 
970d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
971d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
972d7ed1a4dSandi selinger       if (anzi > 8) {
973d7ed1a4dSandi selinger         inputi      = worki_L1;
974d7ed1a4dSandi selinger         inputj      = workj_L1;
975d7ed1a4dSandi selinger         inputcol    = workcol;
976d7ed1a4dSandi selinger         outputi_nnz = 0;
977d7ed1a4dSandi selinger 
978d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
979d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
980d7ed1a4dSandi selinger 
981d7ed1a4dSandi selinger         switch (L1_nrows) {
9829371c9d4SSatish Balay         case 1:
9839371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
984d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
985d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
986d7ed1a4dSandi selinger           break;
987d71ae5a4SJacob Faibussowitsch         case 2:
988d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
989d71ae5a4SJacob Faibussowitsch           break;
990d71ae5a4SJacob Faibussowitsch         case 3:
991d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
992d71ae5a4SJacob Faibussowitsch           break;
993d71ae5a4SJacob Faibussowitsch         case 4:
994d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
995d71ae5a4SJacob Faibussowitsch           break;
996d71ae5a4SJacob Faibussowitsch         case 5:
997d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
998d71ae5a4SJacob Faibussowitsch           break;
999d71ae5a4SJacob Faibussowitsch         case 6:
1000d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1001d71ae5a4SJacob Faibussowitsch           break;
1002d71ae5a4SJacob Faibussowitsch         case 7:
1003d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1004d71ae5a4SJacob Faibussowitsch           break;
1005d71ae5a4SJacob Faibussowitsch         case 8:
1006d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1007d71ae5a4SJacob Faibussowitsch           break;
1008d71ae5a4SJacob Faibussowitsch         default:
1009d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1010d7ed1a4dSandi selinger         }
1011d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1012d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1013d7ed1a4dSandi selinger 
10147660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10157660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1016d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1017d7ed1a4dSandi selinger           inputi      = worki_L2;
1018d7ed1a4dSandi selinger           inputj      = workj_L2;
1019d7ed1a4dSandi selinger           inputcol    = workcol;
1020d7ed1a4dSandi selinger           outputi_nnz = 0;
1021f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1022d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1023d7ed1a4dSandi selinger           switch (L2_nrows) {
10249371c9d4SSatish Balay           case 1:
10259371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1026d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1027d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1028d7ed1a4dSandi selinger             break;
1029d71ae5a4SJacob Faibussowitsch           case 2:
1030d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1031d71ae5a4SJacob Faibussowitsch             break;
1032d71ae5a4SJacob Faibussowitsch           case 3:
1033d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1034d71ae5a4SJacob Faibussowitsch             break;
1035d71ae5a4SJacob Faibussowitsch           case 4:
1036d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1037d71ae5a4SJacob Faibussowitsch             break;
1038d71ae5a4SJacob Faibussowitsch           case 5:
1039d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1040d71ae5a4SJacob Faibussowitsch             break;
1041d71ae5a4SJacob Faibussowitsch           case 6:
1042d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1043d71ae5a4SJacob Faibussowitsch             break;
1044d71ae5a4SJacob Faibussowitsch           case 7:
1045d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1046d71ae5a4SJacob Faibussowitsch             break;
1047d71ae5a4SJacob Faibussowitsch           case 8:
1048d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1049d71ae5a4SJacob Faibussowitsch             break;
1050d71ae5a4SJacob Faibussowitsch           default:
1051d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1052d7ed1a4dSandi selinger           }
1053d7ed1a4dSandi selinger           L2_nrows    = 1;
10547660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10557660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10567660c4dbSandi selinger           /* Copy to workj_L2 */
1057d7ed1a4dSandi selinger           if (rowsleft) {
10587660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1059d7ed1a4dSandi selinger           }
1060d7ed1a4dSandi selinger         }
1061d7ed1a4dSandi selinger       }
1062d7ed1a4dSandi selinger     } /* while (rowsleft) */
1063d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1064d7ed1a4dSandi selinger 
1065d7ed1a4dSandi selinger     /* terminate current row */
1066d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1067d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1068d7ed1a4dSandi selinger   }
1069d7ed1a4dSandi selinger 
1070d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10719566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10729566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1073d7ed1a4dSandi selinger 
1074d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1075d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1076f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1077d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1078d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1079d7ed1a4dSandi selinger   c->nonew   = 0;
1080d7ed1a4dSandi selinger 
10814222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1082d7ed1a4dSandi selinger 
1083d7ed1a4dSandi selinger   /* set MatInfo */
1084d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1085d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10864222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10874222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10884222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1089d7ed1a4dSandi selinger 
1090d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1091d7ed1a4dSandi selinger   if (ci[am]) {
10929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
10939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1094d7ed1a4dSandi selinger   } else {
10959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1096d7ed1a4dSandi selinger   }
1097d7ed1a4dSandi selinger #endif
1098d7ed1a4dSandi selinger 
1099d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11009566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11019566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11029566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
11033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1104d7ed1a4dSandi selinger }
1105d7ed1a4dSandi selinger 
1106cd093f1dSJed Brown /* concatenate unique entries and then sort */
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)1107d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1108d71ae5a4SJacob Faibussowitsch {
1109cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1110cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11118c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1112cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1113cd093f1dSJed Brown   PetscReal       afill;
1114cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1115cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1116cd093f1dSJed Brown   char           *seen;
1117cd093f1dSJed Brown 
1118cd093f1dSJed Brown   PetscFunctionBegin;
11199566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1120cd093f1dSJed Brown   ci[0] = 0;
1121cd093f1dSJed Brown 
1122cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11239566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11249566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11259566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1126cd093f1dSJed Brown 
1127cd093f1dSJed Brown   /* Determine ci and cj */
1128cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1129cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
11308e3a54c0SPierre Jolivet     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1131cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11328c78258cSHong Zhang 
1133cd093f1dSJed Brown     /* Pack segrow */
1134cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1135cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1136cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11378c78258cSHong Zhang         bcol = bj[k];
1138cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1139cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11409566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1141cd093f1dSJed Brown           *slot      = bcol;
1142cd093f1dSJed Brown           seen[bcol] = 1;
1143cd093f1dSJed Brown           packlen++;
1144cd093f1dSJed Brown         }
1145cd093f1dSJed Brown       }
1146cd093f1dSJed Brown     }
11478c78258cSHong Zhang 
11488c78258cSHong Zhang     /* Check i-th diagonal entry */
11498c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11508c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11519566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11528c78258cSHong Zhang       *slot   = i;
11538c78258cSHong Zhang       seen[i] = 1;
11548c78258cSHong Zhang       packlen++;
11558c78258cSHong Zhang     }
11568c78258cSHong Zhang 
11579566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11589566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11599566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1160cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1161cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1162cd093f1dSJed Brown   }
11639566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11649566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1165cd093f1dSJed Brown 
1166cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11679566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11689566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1169cd093f1dSJed Brown 
1170cd093f1dSJed Brown   /* put together the new symbolic matrix */
11719566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11729566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1173cd093f1dSJed Brown 
1174cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1175cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1176f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1177cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1178cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1179cd093f1dSJed Brown   c->nonew   = 0;
1180cd093f1dSJed Brown 
11814222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1182cd093f1dSJed Brown 
1183cd093f1dSJed Brown   /* set MatInfo */
11842a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1185cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11864222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11874222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11884222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1189cd093f1dSJed Brown 
1190cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1191cd093f1dSJed Brown   if (ci[am]) {
11929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11939566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1194cd093f1dSJed Brown   } else {
11959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1196cd093f1dSJed Brown   }
1197cd093f1dSJed Brown #endif
11983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1199cd093f1dSJed Brown }
1200cd093f1dSJed Brown 
MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(PetscCtxRt data)1201*2a8381b2SBarry Smith static PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(PetscCtxRt data)
1202d71ae5a4SJacob Faibussowitsch {
1203cc1eb50dSBarry Smith   MatProductCtx_MatMatTransMult *abt = *(MatProductCtx_MatMatTransMult **)data;
12042128a86cSHong Zhang 
12052128a86cSHong Zhang   PetscFunctionBegin;
12069566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12079566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12089566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12099566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12112128a86cSHong Zhang }
12122128a86cSHong Zhang 
MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)1213d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1214d71ae5a4SJacob Faibussowitsch {
121581d82fe4SHong Zhang   Mat                            Bt;
1216cc1eb50dSBarry Smith   MatProductCtx_MatMatTransMult *abt;
12174222ddf1SHong Zhang   Mat_Product                   *product = C->product;
12186718818eSStefano Zampini   char                          *alg;
1219d2853540SHong Zhang 
122081d82fe4SHong Zhang   PetscFunctionBegin;
122128b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
122228b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12236718818eSStefano Zampini 
122481d82fe4SHong Zhang   /* create symbolic Bt */
12257fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
122658b7e2c1SStefano Zampini   PetscCall(MatSetBlockSizes(Bt, A->cmap->bs, B->cmap->bs));
12279566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
122881d82fe4SHong Zhang 
122981d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12309566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12319566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12329566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12339566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12349566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
123581d82fe4SHong Zhang 
1236a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12379566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12382128a86cSHong Zhang 
12396718818eSStefano Zampini   product->data    = abt;
1240cc1eb50dSBarry Smith   product->destroy = MatProductCtxDestroy_SeqAIJ_MatMatMultTrans;
12416718818eSStefano Zampini 
12424222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12432128a86cSHong Zhang 
12444222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12459566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
124640192850SHong Zhang   if (abt->usecoloring) {
1247b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1248b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1249335efc43SPeter Brune     MatColoring          coloring;
12502128a86cSHong Zhang     ISColoring           iscoloring;
12512128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1252e8354b3eSHong Zhang 
12534222ddf1SHong Zhang     /* inode causes memory problem */
12549566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12554222ddf1SHong Zhang 
12569566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12579566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12589566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12599566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12609566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12619566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12629566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12632205254eSKarl Rupp 
126440192850SHong Zhang     abt->matcoloring = matcoloring;
12652205254eSKarl Rupp 
12669566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12672128a86cSHong Zhang 
12682128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12699566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12709566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12719566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12729566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12732205254eSKarl Rupp 
12742128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
127540192850SHong Zhang     abt->Bt_den         = Bt_dense;
12762128a86cSHong Zhang 
12779566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12789566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12799566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12809566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12812205254eSKarl Rupp 
12822128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128340192850SHong Zhang     abt->ABt_den        = C_dense;
1284f94ccd6cSHong Zhang 
1285f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12861ce71dffSSatish Balay     {
12874222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
12889371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1289f4f49eeaSPierre Jolivet                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
12901ce71dffSSatish Balay     }
1291f94ccd6cSHong Zhang #endif
12922128a86cSHong Zhang   }
1293e99be685SHong Zhang   /* clean up */
12949566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
12953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12965df89d91SHong Zhang }
12975df89d91SHong Zhang 
MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)1298d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1299d71ae5a4SJacob Faibussowitsch {
13005df89d91SHong Zhang   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1301e2cac8caSJed Brown   PetscInt                      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
130281d82fe4SHong Zhang   PetscInt                       cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13035df89d91SHong Zhang   PetscLogDouble                 flops = 0.0;
1304aa1aec99SHong Zhang   MatScalar                     *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1305cc1eb50dSBarry Smith   MatProductCtx_MatMatTransMult *abt;
13066718818eSStefano Zampini   Mat_Product                   *product = C->product;
13075df89d91SHong Zhang 
13085df89d91SHong Zhang   PetscFunctionBegin;
130928b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1310cc1eb50dSBarry Smith   abt = (MatProductCtx_MatMatTransMult *)product->data;
131128b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
131258ed3ceaSHong Zhang   /* clear old values in C */
131358ed3ceaSHong Zhang   if (!c->a) {
13149566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
131558ed3ceaSHong Zhang     c->a      = ca;
131658ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
131758ed3ceaSHong Zhang   } else {
131858ed3ceaSHong Zhang     ca = c->a;
13199566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
132058ed3ceaSHong Zhang   }
132158ed3ceaSHong Zhang 
132240192850SHong Zhang   if (abt->usecoloring) {
132340192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
132440192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1325c8db22f6SHong Zhang 
1326b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
132740192850SHong Zhang     Bt_dense = abt->Bt_den;
13289566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1329c8db22f6SHong Zhang 
1330c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13319566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1332c8db22f6SHong Zhang 
13332128a86cSHong Zhang     /* Recover C from C_dense */
13349566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
13353ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1336c8db22f6SHong Zhang   }
1337c8db22f6SHong Zhang 
13381fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
133981d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
13408e3a54c0SPierre Jolivet     acol = PetscSafePointerPlusOffset(aj, ai[i]);
13418e3a54c0SPierre Jolivet     aval = PetscSafePointerPlusOffset(aa, ai[i]);
13421fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13438e3a54c0SPierre Jolivet     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
13446973769bSHong Zhang     cval = ca + ci[i];
13451fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
134681d82fe4SHong Zhang       brow = ccol[j];
134781d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
134881d82fe4SHong Zhang       bcol = bj + bi[brow];
13496973769bSHong Zhang       bval = ba + bi[brow];
13506973769bSHong Zhang 
135181d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13529371c9d4SSatish Balay       nexta = 0;
13539371c9d4SSatish Balay       nextb = 0;
135481d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13557b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
135681d82fe4SHong Zhang         if (nexta == anzi) break;
13577b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
135881d82fe4SHong Zhang         if (nextb == bnzj) break;
135981d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13606973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13619371c9d4SSatish Balay           nexta++;
13629371c9d4SSatish Balay           nextb++;
136381d82fe4SHong Zhang           flops += 2;
13641fa1209cSHong Zhang         }
13651fa1209cSHong Zhang       }
136681d82fe4SHong Zhang     }
136781d82fe4SHong Zhang   }
13689566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13709566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13725df89d91SHong Zhang }
13735df89d91SHong Zhang 
MatProductCtxDestroy_SeqAIJ_MatTransMatMult(PetscCtxRt data)1374*2a8381b2SBarry Smith PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatTransMatMult(PetscCtxRt data)
1375d71ae5a4SJacob Faibussowitsch {
1376cc1eb50dSBarry Smith   MatProductCtx_MatTransMatMult *atb = *(MatProductCtx_MatTransMatMult **)data;
13776d373c3eSHong Zhang 
13786d373c3eSHong Zhang   PetscFunctionBegin;
13799566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
1380cc1eb50dSBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(&atb->data));
13819566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13836d373c3eSHong Zhang }
13846d373c3eSHong Zhang 
MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)1385d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1386d71ae5a4SJacob Faibussowitsch {
1387089a957eSStefano Zampini   Mat          At      = NULL;
13884222ddf1SHong Zhang   Mat_Product *product = C->product;
1389089a957eSStefano Zampini   PetscBool    flg, def, square;
1390bc011b1eSHong Zhang 
1391bc011b1eSHong Zhang   PetscFunctionBegin;
1392089a957eSStefano Zampini   MatCheckProduct(C, 4);
1393b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
13944222ddf1SHong Zhang   /* outerproduct */
13959566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
13964222ddf1SHong Zhang   if (flg) {
1397bc011b1eSHong Zhang     /* create symbolic At */
1398089a957eSStefano Zampini     if (!square) {
13997fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
140058b7e2c1SStefano Zampini       PetscCall(MatSetBlockSizes(At, A->cmap->bs, B->cmap->bs));
14019566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1402089a957eSStefano Zampini     }
1403bc011b1eSHong Zhang     /* get symbolic C=At*B */
14049566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14059566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1406bc011b1eSHong Zhang 
1407bc011b1eSHong Zhang     /* clean up */
140848a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14096d373c3eSHong Zhang 
14104222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14119566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14123ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14134222ddf1SHong Zhang   }
14144222ddf1SHong Zhang 
14154222ddf1SHong Zhang   /* matmatmult */
14169566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14179566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1418089a957eSStefano Zampini   if (flg || def) {
1419cc1eb50dSBarry Smith     MatProductCtx_MatTransMatMult *atb;
14204222ddf1SHong Zhang 
142128b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14229566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
142348a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14249566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14259566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14269566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14276718818eSStefano Zampini     product->data    = atb;
1428cc1eb50dSBarry Smith     product->destroy = MatProductCtxDestroy_SeqAIJ_MatTransMatMult;
14294222ddf1SHong Zhang     atb->At          = At;
14304222ddf1SHong Zhang 
14314222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14323ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14334222ddf1SHong Zhang   }
14344222ddf1SHong Zhang 
14354222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1436bc011b1eSHong Zhang }
1437bc011b1eSHong Zhang 
MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)1438d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1439d71ae5a4SJacob Faibussowitsch {
14400fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1441d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1442d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1443d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1444aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1445bc011b1eSHong Zhang 
1446bc011b1eSHong Zhang   PetscFunctionBegin;
1447aa1aec99SHong Zhang   if (!c->a) {
14489566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14492205254eSKarl Rupp 
1450aa1aec99SHong Zhang     c->a      = ca;
1451aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1452aa1aec99SHong Zhang   } else {
1453aa1aec99SHong Zhang     ca = c->a;
14549566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1455aa1aec99SHong Zhang   }
1456bc011b1eSHong Zhang 
1457bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1458bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1459bc011b1eSHong Zhang     bj   = b->j + bi[i];
1460bc011b1eSHong Zhang     ba   = b->a + bi[i];
1461bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1462bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1463bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1464bc011b1eSHong Zhang       nextb = 0;
14650fbc74f4SHong Zhang       crow  = *aj++;
1466bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1467bc011b1eSHong Zhang       caj   = ca + ci[crow];
1468bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1469bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14700fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14710fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1472bc011b1eSHong Zhang           nextb++;
1473bc011b1eSHong Zhang         }
1474bc011b1eSHong Zhang       }
1475bc011b1eSHong Zhang       flops += 2 * bnzi;
14760fbc74f4SHong Zhang       aa++;
1477bc011b1eSHong Zhang     }
1478bc011b1eSHong Zhang   }
1479bc011b1eSHong Zhang 
1480bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14819566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14829566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
14843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1485bc011b1eSHong Zhang }
14869123193aSHong Zhang 
MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1487d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1488d71ae5a4SJacob Faibussowitsch {
14899123193aSHong Zhang   PetscFunctionBegin;
14909566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
14914222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14939123193aSHong Zhang }
14949123193aSHong Zhang 
MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)1495d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1496d71ae5a4SJacob Faibussowitsch {
1497f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
14981ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1499a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1500daf57211SHong Zhang   const PetscInt    *aj;
150175f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
150275f6d85dSStefano Zampini   PetscInt           clda;
150375f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15049123193aSHong Zhang 
15059123193aSHong Zhang   PetscFunctionBegin;
15063ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
15079566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
150893aa15f2SStefano Zampini   if (add) {
15099566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
151093aa15f2SStefano Zampini   } else {
15119566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
151293aa15f2SStefano Zampini   }
15139566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15149566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15159566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
151675f6d85dSStefano Zampini   am4 = 4 * clda;
151775f6d85dSStefano Zampini   bm4 = 4 * bm;
15185c0db29aSPierre Jolivet   if (b) {
15199371c9d4SSatish Balay     b1 = b;
15209371c9d4SSatish Balay     b2 = b1 + bm;
15219371c9d4SSatish Balay     b3 = b2 + bm;
15229371c9d4SSatish Balay     b4 = b3 + bm;
15235c0db29aSPierre Jolivet   } else b1 = b2 = b3 = b4 = NULL;
15249371c9d4SSatish Balay   c1 = c;
15259371c9d4SSatish Balay   c2 = c1 + clda;
15269371c9d4SSatish Balay   c3 = c2 + clda;
15279371c9d4SSatish Balay   c4 = c3 + clda;
15281ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15291ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1530f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1531f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
15328e3a54c0SPierre Jolivet       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
15338e3a54c0SPierre Jolivet       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1534f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15351ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15361ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1537730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1538730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1539730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1540730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15419123193aSHong Zhang       }
154293aa15f2SStefano Zampini       if (add) {
154387753ddeSHong Zhang         c1[i] += r1;
154487753ddeSHong Zhang         c2[i] += r2;
154587753ddeSHong Zhang         c3[i] += r3;
154687753ddeSHong Zhang         c4[i] += r4;
154793aa15f2SStefano Zampini       } else {
154893aa15f2SStefano Zampini         c1[i] = r1;
154993aa15f2SStefano Zampini         c2[i] = r2;
155093aa15f2SStefano Zampini         c3[i] = r3;
155193aa15f2SStefano Zampini         c4[i] = r4;
155293aa15f2SStefano Zampini       }
1553f32d5d43SBarry Smith     }
15545c0db29aSPierre Jolivet     if (b) {
15559371c9d4SSatish Balay       b1 += bm4;
15569371c9d4SSatish Balay       b2 += bm4;
15579371c9d4SSatish Balay       b3 += bm4;
15589371c9d4SSatish Balay       b4 += bm4;
15595c0db29aSPierre Jolivet     }
15609371c9d4SSatish Balay     c1 += am4;
15619371c9d4SSatish Balay     c2 += am4;
15629371c9d4SSatish Balay     c3 += am4;
15639371c9d4SSatish Balay     c4 += am4;
1564f32d5d43SBarry Smith   }
156593aa15f2SStefano Zampini   /* process remaining columns */
156693aa15f2SStefano Zampini   if (col != cn) {
156793aa15f2SStefano Zampini     PetscInt rc = cn - col;
156893aa15f2SStefano Zampini 
156993aa15f2SStefano Zampini     if (rc == 1) {
157093aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1571f32d5d43SBarry Smith         r1 = 0.0;
1572f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
15738e3a54c0SPierre Jolivet         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
15748e3a54c0SPierre Jolivet         aa = PetscSafePointerPlusOffset(av, a->i[i]);
157593aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
157693aa15f2SStefano Zampini         if (add) c1[i] += r1;
157793aa15f2SStefano Zampini         else c1[i] = r1;
157893aa15f2SStefano Zampini       }
157993aa15f2SStefano Zampini     } else if (rc == 2) {
158093aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
158193aa15f2SStefano Zampini         r1 = r2 = 0.0;
158293aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
15838e3a54c0SPierre Jolivet         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
15848e3a54c0SPierre Jolivet         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1585f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
158693aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
158793aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
158893aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
158993aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1590f32d5d43SBarry Smith         }
159193aa15f2SStefano Zampini         if (add) {
159287753ddeSHong Zhang           c1[i] += r1;
159393aa15f2SStefano Zampini           c2[i] += r2;
159493aa15f2SStefano Zampini         } else {
159593aa15f2SStefano Zampini           c1[i] = r1;
159693aa15f2SStefano Zampini           c2[i] = r2;
1597f32d5d43SBarry Smith         }
159893aa15f2SStefano Zampini       }
159993aa15f2SStefano Zampini     } else {
160093aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
160193aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
160293aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
16038e3a54c0SPierre Jolivet         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
16048e3a54c0SPierre Jolivet         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
160593aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
160693aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
160793aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
160893aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
160993aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
161093aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
161193aa15f2SStefano Zampini         }
161293aa15f2SStefano Zampini         if (add) {
161393aa15f2SStefano Zampini           c1[i] += r1;
161493aa15f2SStefano Zampini           c2[i] += r2;
161593aa15f2SStefano Zampini           c3[i] += r3;
161693aa15f2SStefano Zampini         } else {
161793aa15f2SStefano Zampini           c1[i] = r1;
161893aa15f2SStefano Zampini           c2[i] = r2;
161993aa15f2SStefano Zampini           c3[i] = r3;
162093aa15f2SStefano Zampini         }
162193aa15f2SStefano Zampini       }
162293aa15f2SStefano Zampini     }
1623f32d5d43SBarry Smith   }
16249566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
162593aa15f2SStefano Zampini   if (add) {
16269566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
162793aa15f2SStefano Zampini   } else {
16289566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
162993aa15f2SStefano Zampini   }
16309566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
16323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1633f32d5d43SBarry Smith }
1634f32d5d43SBarry Smith 
MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)1635d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1636d71ae5a4SJacob Faibussowitsch {
1637f32d5d43SBarry Smith   PetscFunctionBegin;
163808401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
163908401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
164008401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16414324174eSBarry Smith 
16429566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16449123193aSHong Zhang }
1645b1683b59SHong Zhang 
MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)1646d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1647d71ae5a4SJacob Faibussowitsch {
16484222ddf1SHong Zhang   PetscFunctionBegin;
16494222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16504222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16524222ddf1SHong Zhang }
16534222ddf1SHong Zhang 
16546718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16556718818eSStefano Zampini 
MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)1656d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1657d71ae5a4SJacob Faibussowitsch {
16584222ddf1SHong Zhang   PetscFunctionBegin;
16596718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16604222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16626718818eSStefano Zampini }
16636718818eSStefano Zampini 
MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)1664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1665d71ae5a4SJacob Faibussowitsch {
16666718818eSStefano Zampini   PetscFunctionBegin;
16676718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16686718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16704222ddf1SHong Zhang }
16714222ddf1SHong Zhang 
MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)1672d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1673d71ae5a4SJacob Faibussowitsch {
16744222ddf1SHong Zhang   Mat_Product *product = C->product;
16754222ddf1SHong Zhang 
16764222ddf1SHong Zhang   PetscFunctionBegin;
16774222ddf1SHong Zhang   switch (product->type) {
1678d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1679d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1680d71ae5a4SJacob Faibussowitsch     break;
1681d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1682d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1683d71ae5a4SJacob Faibussowitsch     break;
1684d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1685d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1686d71ae5a4SJacob Faibussowitsch     break;
1687d71ae5a4SJacob Faibussowitsch   default:
1688d71ae5a4SJacob Faibussowitsch     break;
16894222ddf1SHong Zhang   }
16903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16914222ddf1SHong Zhang }
16922ef1f0ffSBarry Smith 
MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)1693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1694d71ae5a4SJacob Faibussowitsch {
16954222ddf1SHong Zhang   Mat_Product *product = C->product;
16964222ddf1SHong Zhang   Mat          A       = product->A;
16974222ddf1SHong Zhang   PetscBool    baij;
16984222ddf1SHong Zhang 
16994222ddf1SHong Zhang   PetscFunctionBegin;
17009566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17014222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17024222ddf1SHong Zhang     PetscBool sbaij;
17039566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
170428b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17054222ddf1SHong Zhang 
17064222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17074222ddf1SHong Zhang   } else { /* A is seqbaij */
17084222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17094222ddf1SHong Zhang   }
17104222ddf1SHong Zhang 
17114222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17134222ddf1SHong Zhang }
17144222ddf1SHong Zhang 
MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)1715d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1716d71ae5a4SJacob Faibussowitsch {
17174222ddf1SHong Zhang   Mat_Product *product = C->product;
17184222ddf1SHong Zhang 
17194222ddf1SHong Zhang   PetscFunctionBegin;
17206718818eSStefano Zampini   MatCheckProduct(C, 1);
172128b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1722b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17231766d9c3SPierre Jolivet   else if (product->type == MATPRODUCT_AtB) {
17241766d9c3SPierre Jolivet     PetscBool flg;
17251766d9c3SPierre Jolivet 
17261766d9c3SPierre Jolivet     PetscCall(PetscObjectTypeCompare((PetscObject)product->A, MATSEQBAIJ, &flg));
17271766d9c3SPierre Jolivet     if (flg) {
17281766d9c3SPierre Jolivet       C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense;
17291766d9c3SPierre Jolivet       C->ops->productsymbolic          = MatProductSymbolic_AtB;
17301766d9c3SPierre Jolivet     }
17311766d9c3SPierre Jolivet   }
17323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17334222ddf1SHong Zhang }
17346718818eSStefano Zampini 
MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)1735d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1736d71ae5a4SJacob Faibussowitsch {
17374222ddf1SHong Zhang   PetscFunctionBegin;
17384222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17394222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17414222ddf1SHong Zhang }
17424222ddf1SHong Zhang 
MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)1743d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1744d71ae5a4SJacob Faibussowitsch {
17454222ddf1SHong Zhang   Mat_Product *product = C->product;
17464222ddf1SHong Zhang 
17474222ddf1SHong Zhang   PetscFunctionBegin;
174848a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17504222ddf1SHong Zhang }
17514222ddf1SHong Zhang 
MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)1752d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1753d71ae5a4SJacob Faibussowitsch {
17542f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17552f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17562f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17572f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17582f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17592f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1760c8db22f6SHong Zhang 
1761c8db22f6SHong Zhang   PetscFunctionBegin;
17622f5f1f90SHong Zhang   btval_den = btdense->v;
17639566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17642f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17652f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17662f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1767d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17682f5f1f90SHong Zhang       btcol = bj + bi[col];
17692f5f1f90SHong Zhang       btval = ba + bi[col];
17702f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1771d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17722f5f1f90SHong Zhang         brow            = btcol[j];
17732f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1774c8db22f6SHong Zhang       }
1775c8db22f6SHong Zhang     }
17762f5f1f90SHong Zhang     btval_den += m;
1777c8db22f6SHong Zhang   }
17783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1779c8db22f6SHong Zhang }
1780c8db22f6SHong Zhang 
MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)1781d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1782d71ae5a4SJacob Faibussowitsch {
1783b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17841683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17851683a169SBarry Smith   PetscScalar       *ca = csp->a;
1786f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1787e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1788077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1789077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17908972f759SHong Zhang 
17918972f759SHong Zhang   PetscFunctionBegin;
17929566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1793f99a636bSHong Zhang 
1794077f23c2SHong Zhang   if (brows > 0) {
1795077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1796980ae229SHong Zhang     lstart = matcoloring->lstart;
17979566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1798eeb4fd02SHong Zhang 
1799077f23c2SHong Zhang     row_end = brows;
1800eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1801077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1802077f23c2SHong Zhang       ca_den_ptr = ca_den;
1803980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1804eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1805eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1806eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1807eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1808eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1809eeb4fd02SHong Zhang             lstart[k] = l;
1810eeb4fd02SHong Zhang             break;
1811eeb4fd02SHong Zhang           } else {
1812077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1813eeb4fd02SHong Zhang           }
1814eeb4fd02SHong Zhang         }
1815077f23c2SHong Zhang         ca_den_ptr += m;
1816eeb4fd02SHong Zhang       }
1817077f23c2SHong Zhang       row_end += brows;
1818eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1819eeb4fd02SHong Zhang     }
1820077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1821077f23c2SHong Zhang     ca_den_ptr = ca_den;
1822077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1823077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1824077f23c2SHong Zhang       row   = rows + colorforrow[k];
1825077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1826ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1827077f23c2SHong Zhang       ca_den_ptr += m;
1828077f23c2SHong Zhang     }
1829f99a636bSHong Zhang   }
1830f99a636bSHong Zhang 
18319566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1832f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1833077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18349566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1835e88777f2SHong Zhang   } else {
18369566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1837e88777f2SHong Zhang   }
1838f99a636bSHong Zhang #endif
18393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18408972f759SHong Zhang }
18418972f759SHong Zhang 
MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)1842d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1843d71ae5a4SJacob Faibussowitsch {
1844e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18451a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1846b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1847b1683b59SHong Zhang   IS             *isa;
1848b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1849e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1850e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1851e88777f2SHong Zhang   PetscBool       flg;
1852b1683b59SHong Zhang 
1853b1683b59SHong Zhang   PetscFunctionBegin;
18549566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1855e99be685SHong Zhang 
18567ecccc15SHong Zhang   /* bs > 1 is not being tested yet! */
1857e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1858b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1859e88777f2SHong Zhang   c->N      = Nbs;
1860e88777f2SHong Zhang   c->m      = c->M;
1861b1683b59SHong Zhang   c->rstart = 0;
1862e88777f2SHong Zhang   c->brows  = 100;
1863b1683b59SHong Zhang 
1864b1683b59SHong Zhang   c->ncolors = nis;
18659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18679566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1868e88777f2SHong Zhang 
1869e88777f2SHong Zhang   brows = c->brows;
18709566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1871e88777f2SHong Zhang   if (flg) c->brows = brows;
187248a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18732205254eSKarl Rupp 
1874d2853540SHong Zhang   colorforrow[0] = 0;
1875d2853540SHong Zhang   rows_i         = rows;
1876f99a636bSHong Zhang   den2sp_i       = den2sp;
1877b1683b59SHong Zhang 
18789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18799566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18802205254eSKarl Rupp 
1881d2853540SHong Zhang   colorforcol[0] = 0;
1882d2853540SHong Zhang   columns_i      = columns;
1883d2853540SHong Zhang 
1884077f23c2SHong Zhang   /* get column-wise storage of mat */
18859566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1886b1683b59SHong Zhang 
1887b28d80bdSHong Zhang   cm = c->m;
18889566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18899566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1890f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18919566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18929566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18932205254eSKarl Rupp 
1894b1683b59SHong Zhang     c->ncolumns[i] = n;
18951baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1896d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1897d2853540SHong Zhang     columns_i += n;
1898b1683b59SHong Zhang 
1899b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
19009566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1901e99be685SHong Zhang 
1902b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1903b1683b59SHong Zhang       col     = is[j];
1904d2853540SHong Zhang       row_idx = cj + ci[col];
1905b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1906b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1907e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1908d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1909b1683b59SHong Zhang       }
1910b1683b59SHong Zhang     }
1911b1683b59SHong Zhang     /* count the number of hits */
1912b1683b59SHong Zhang     nrows = 0;
1913e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1914b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1915b1683b59SHong Zhang     }
1916b1683b59SHong Zhang     c->nrows[i]        = nrows;
1917d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1918d2853540SHong Zhang 
1919b1683b59SHong Zhang     nrows = 0;
1920b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1921b1683b59SHong Zhang       if (rowhit[j]) {
1922d2853540SHong Zhang         rows_i[nrows]   = j;
192312b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1924b1683b59SHong Zhang         nrows++;
1925b1683b59SHong Zhang       }
1926b1683b59SHong Zhang     }
1927e88777f2SHong Zhang     den2sp_i += nrows;
1928077f23c2SHong Zhang 
19299566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1930f99a636bSHong Zhang     rows_i += nrows;
1931b1683b59SHong Zhang   }
19329566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19339566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19349566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19352c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1936b28d80bdSHong Zhang 
1937d2853540SHong Zhang   c->colorforrow = colorforrow;
1938d2853540SHong Zhang   c->rows        = rows;
1939f99a636bSHong Zhang   c->den2sp      = den2sp;
1940d2853540SHong Zhang   c->colorforcol = colorforcol;
1941d2853540SHong Zhang   c->columns     = columns;
19422205254eSKarl Rupp 
19439566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
19443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1945b1683b59SHong Zhang }
1946d013fc79Sandi selinger 
MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)1947d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1948d71ae5a4SJacob Faibussowitsch {
19494222ddf1SHong Zhang   Mat_Product *product = C->product;
19504222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19514222ddf1SHong Zhang 
1952df97dc6dSFande Kong   PetscFunctionBegin;
19534222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19544222ddf1SHong Zhang     /* Alg: "outerproduct" */
19559566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19564222ddf1SHong Zhang   } else {
19574222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
1958cc1eb50dSBarry Smith     MatProductCtx_MatTransMatMult *atb = (MatProductCtx_MatTransMatMult *)product->data;
19594222ddf1SHong Zhang 
196028b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19611cdffd5eSHong Zhang     if (atb->At) {
19621cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19631cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19641cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19657fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19664222ddf1SHong Zhang     }
19677fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19684222ddf1SHong Zhang   }
19693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1970df97dc6dSFande Kong }
1971df97dc6dSFande Kong 
MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)1972d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1973d71ae5a4SJacob Faibussowitsch {
19744222ddf1SHong Zhang   Mat_Product *product = C->product;
19754222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19764222ddf1SHong Zhang   PetscReal    fill = product->fill;
1977d013fc79Sandi selinger 
1978d013fc79Sandi selinger   PetscFunctionBegin;
19799566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19802869b61bSandi selinger 
19814222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19832869b61bSandi selinger }
1984d013fc79Sandi selinger 
MatProductSetFromOptions_SeqAIJ_AB(Mat C)1985d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1986d71ae5a4SJacob Faibussowitsch {
19874222ddf1SHong Zhang   Mat_Product *product = C->product;
19884222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19894222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19904222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19914222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19924222ddf1SHong Zhang   PetscInt    nalg        = 7;
19934222ddf1SHong Zhang #else
19944222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19954222ddf1SHong Zhang   PetscInt    nalg        = 8;
19964222ddf1SHong Zhang #endif
19974222ddf1SHong Zhang 
19984222ddf1SHong Zhang   PetscFunctionBegin;
19994222ddf1SHong Zhang   /* Set default algorithm */
20009566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2001835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2002d013fc79Sandi selinger 
20034222ddf1SHong Zhang   /* Get runtime option */
20044222ddf1SHong Zhang   if (product->api_user) {
2005d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
20069566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2007d0609cedSBarry Smith     PetscOptionsEnd();
20084222ddf1SHong Zhang   } else {
2009d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20109566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2011d0609cedSBarry Smith     PetscOptionsEnd();
2012d013fc79Sandi selinger   }
2013835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2014d013fc79Sandi selinger 
20154222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20164222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20184222ddf1SHong Zhang }
2019d013fc79Sandi selinger 
MatProductSetFromOptions_SeqAIJ_AtB(Mat C)2020d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2021d71ae5a4SJacob Faibussowitsch {
20224222ddf1SHong Zhang   Mat_Product *product     = C->product;
20234222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20244222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2025089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2026089a957eSStefano Zampini   PetscInt     nalg        = 3;
2027d013fc79Sandi selinger 
20284222ddf1SHong Zhang   PetscFunctionBegin;
20294222ddf1SHong Zhang   /* Get runtime option */
20304222ddf1SHong Zhang   if (product->api_user) {
2031d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20329566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2033d0609cedSBarry Smith     PetscOptionsEnd();
20344222ddf1SHong Zhang   } else {
2035d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20369566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2037d0609cedSBarry Smith     PetscOptionsEnd();
20384222ddf1SHong Zhang   }
2039835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2040d013fc79Sandi selinger 
20414222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20434222ddf1SHong Zhang }
20444222ddf1SHong Zhang 
MatProductSetFromOptions_SeqAIJ_ABt(Mat C)2045d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2046d71ae5a4SJacob Faibussowitsch {
20474222ddf1SHong Zhang   Mat_Product *product     = C->product;
20484222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20494222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20504222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20514222ddf1SHong Zhang   PetscInt     nalg        = 2;
20524222ddf1SHong Zhang 
20534222ddf1SHong Zhang   PetscFunctionBegin;
20544222ddf1SHong Zhang   /* Set default algorithm */
20559566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20564222ddf1SHong Zhang   if (!flg) {
20574222ddf1SHong Zhang     alg = 1;
2058835f2295SStefano Zampini     PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
20594222ddf1SHong Zhang   }
20604222ddf1SHong Zhang 
20614222ddf1SHong Zhang   /* Get runtime option */
20624222ddf1SHong Zhang   if (product->api_user) {
2063d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20649566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2065d0609cedSBarry Smith     PetscOptionsEnd();
20664222ddf1SHong Zhang   } else {
2067d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20689566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2069d0609cedSBarry Smith     PetscOptionsEnd();
20704222ddf1SHong Zhang   }
2071835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
20724222ddf1SHong Zhang 
20734222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20744222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20764222ddf1SHong Zhang }
20774222ddf1SHong Zhang 
MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)2078d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2079d71ae5a4SJacob Faibussowitsch {
20804222ddf1SHong Zhang   Mat_Product *product = C->product;
20814222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20824222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20834222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20844222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20854222ddf1SHong Zhang   PetscInt    nalg        = 2;
20864222ddf1SHong Zhang #else
20874222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20884222ddf1SHong Zhang   PetscInt    nalg        = 3;
20894222ddf1SHong Zhang #endif
20904222ddf1SHong Zhang 
20914222ddf1SHong Zhang   PetscFunctionBegin;
20924222ddf1SHong Zhang   /* Set default algorithm */
20939566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2094835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
20954222ddf1SHong Zhang 
20964222ddf1SHong Zhang   /* Get runtime option */
20974222ddf1SHong Zhang   if (product->api_user) {
2098d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
20999566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2100d0609cedSBarry Smith     PetscOptionsEnd();
21014222ddf1SHong Zhang   } else {
2102d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
21039566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2104d0609cedSBarry Smith     PetscOptionsEnd();
21054222ddf1SHong Zhang   }
2106835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
21074222ddf1SHong Zhang 
21084222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21104222ddf1SHong Zhang }
21114222ddf1SHong Zhang 
MatProductSetFromOptions_SeqAIJ_RARt(Mat C)2112d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2113d71ae5a4SJacob Faibussowitsch {
21144222ddf1SHong Zhang   Mat_Product *product     = C->product;
21154222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21164222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21174222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21184222ddf1SHong Zhang   PetscInt     nalg        = 3;
21194222ddf1SHong Zhang 
21204222ddf1SHong Zhang   PetscFunctionBegin;
21214222ddf1SHong Zhang   /* Set default algorithm */
21229566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2123835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
21244222ddf1SHong Zhang 
21254222ddf1SHong Zhang   /* Get runtime option */
21264222ddf1SHong Zhang   if (product->api_user) {
2127d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21289566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2129d0609cedSBarry Smith     PetscOptionsEnd();
21304222ddf1SHong Zhang   } else {
2131d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21329566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2133d0609cedSBarry Smith     PetscOptionsEnd();
21344222ddf1SHong Zhang   }
2135835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
21364222ddf1SHong Zhang 
21374222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21394222ddf1SHong Zhang }
21404222ddf1SHong Zhang 
21414222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
MatProductSetFromOptions_SeqAIJ_ABC(Mat C)2142d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2143d71ae5a4SJacob Faibussowitsch {
21444222ddf1SHong Zhang   Mat_Product *product     = C->product;
21454222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21464222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21474222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21484222ddf1SHong Zhang   PetscInt     nalg        = 7;
21494222ddf1SHong Zhang 
21504222ddf1SHong Zhang   PetscFunctionBegin;
21514222ddf1SHong Zhang   /* Set default algorithm */
21529566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2153835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
21544222ddf1SHong Zhang 
21554222ddf1SHong Zhang   /* Get runtime option */
21564222ddf1SHong Zhang   if (product->api_user) {
2157d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21589566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2159d0609cedSBarry Smith     PetscOptionsEnd();
21604222ddf1SHong Zhang   } else {
2161d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21629566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2163d0609cedSBarry Smith     PetscOptionsEnd();
21644222ddf1SHong Zhang   }
2165835f2295SStefano Zampini   if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
21664222ddf1SHong Zhang 
21674222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21684222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21704222ddf1SHong Zhang }
21714222ddf1SHong Zhang 
MatProductSetFromOptions_SeqAIJ(Mat C)2172d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2173d71ae5a4SJacob Faibussowitsch {
21744222ddf1SHong Zhang   Mat_Product *product = C->product;
21754222ddf1SHong Zhang 
21764222ddf1SHong Zhang   PetscFunctionBegin;
21774222ddf1SHong Zhang   switch (product->type) {
2178d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2179d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2180d71ae5a4SJacob Faibussowitsch     break;
2181d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2182d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2183d71ae5a4SJacob Faibussowitsch     break;
2184d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2185d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2186d71ae5a4SJacob Faibussowitsch     break;
2187d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2188d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2189d71ae5a4SJacob Faibussowitsch     break;
2190d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2191d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2192d71ae5a4SJacob Faibussowitsch     break;
2193d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2194d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2195d71ae5a4SJacob Faibussowitsch     break;
2196d71ae5a4SJacob Faibussowitsch   default:
2197d71ae5a4SJacob Faibussowitsch     break;
21984222ddf1SHong Zhang   }
21993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2200d013fc79Sandi selinger }
2201