1632d0f97SHong Zhang /*
2632d0f97SHong Zhang Routines to compute overlapping regions of a parallel MPI matrix.
3632d0f97SHong Zhang Used for finding submatrices that were shared across processors.
4632d0f97SHong Zhang */
5c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
6c6db04a5SJed Brown #include <petscbt.h>
7632d0f97SHong Zhang
813f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *);
913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *);
10632d0f97SHong Zhang
MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov)11d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov)
12d71ae5a4SJacob Faibussowitsch {
13b11de2a9SHong Zhang PetscInt i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov;
14b11de2a9SHong Zhang IS *is_new, *is_row;
15b11de2a9SHong Zhang Mat *submats;
16b11de2a9SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
17b11de2a9SHong Zhang Mat_SeqSBAIJ *asub_i;
18b11de2a9SHong Zhang PetscBT table;
19f1957bc3SPierre Jolivet PetscInt *ai, brow, nz, nis, l, nmax, nstages, max_no, pos;
20b11de2a9SHong Zhang const PetscInt *idx;
21b829d874SHong Zhang PetscBool flg;
22632d0f97SHong Zhang
23632d0f97SHong Zhang PetscFunctionBegin;
249566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &is_new));
25632d0f97SHong Zhang /* Convert the indices into block format */
269566063dSJacob Faibussowitsch PetscCall(ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new));
2708401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
28db41cccfSHong Zhang
292ef1f0ffSBarry Smith /* previous non-scalable implementation */
30a0769a71SSatish Balay flg = PETSC_FALSE;
319566063dSJacob Faibussowitsch PetscCall(PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg));
327a868f3eSHong Zhang if (flg) { /* previous non-scalable implementation */
337a868f3eSHong Zhang printf("use previous non-scalable implementation...\n");
3448a46eb9SPierre Jolivet for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new));
351c4982ebSHong Zhang } else { /* implementation using modified BAIJ routines */
361c4982ebSHong Zhang
379566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Mbs + 1, &nidx));
389566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &table)); /* for column search */
39db41cccfSHong Zhang
40db41cccfSHong Zhang /* Create is_row */
419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &is_row));
429566063dSJacob Faibussowitsch PetscCall(ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0]));
4326fbe8dcSKarl Rupp
44ac530a7eSPierre Jolivet for (i = 1; i < is_max; i++) is_row[i] = is_row[0]; /* reuse is_row[0] */
45db41cccfSHong Zhang
467dae84e0SHong Zhang /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */
479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max + 1, &submats));
48b11de2a9SHong Zhang
49db41cccfSHong Zhang /* Determine the number of stages through which submatrices are done */
50db41cccfSHong Zhang nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt));
51db41cccfSHong Zhang if (!nmax) nmax = 1;
52f1957bc3SPierre Jolivet nstages = is_max / nmax + ((is_max % nmax) ? 1 : 0);
53db41cccfSHong Zhang
54db41cccfSHong Zhang /* Make sure every processor loops through the nstages */
55f1957bc3SPierre Jolivet PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
56b11de2a9SHong Zhang
57c6a7a370SJeremy L Thompson {
58c6a7a370SJeremy L Thompson const PetscObject obj = (PetscObject)c->A;
59c6a7a370SJeremy L Thompson size_t new_len, cur_len, max_len;
60c6a7a370SJeremy L Thompson
61c6a7a370SJeremy L Thompson PetscCall(PetscStrlen(MATSEQBAIJ, &new_len));
62c6a7a370SJeremy L Thompson PetscCall(PetscStrlen(MATSEQSBAIJ, &cur_len));
63c6a7a370SJeremy L Thompson max_len = PetscMax(cur_len, new_len) + 1;
64f4f49eeaSPierre Jolivet PetscCall(PetscRealloc(max_len * sizeof(*obj->type_name), &obj->type_name));
65c6a7a370SJeremy L Thompson /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to
66c6a7a370SJeremy L Thompson trigger that */
67b11de2a9SHong Zhang for (iov = 0; iov < ov; ++iov) {
68b11de2a9SHong Zhang /* 1) Get submats for column search */
69c6a7a370SJeremy L Thompson PetscCall(PetscStrncpy(obj->type_name, MATSEQBAIJ, max_len));
70db41cccfSHong Zhang for (i = 0, pos = 0; i < nstages; i++) {
71db41cccfSHong Zhang if (pos + nmax <= is_max) max_no = nmax;
72db41cccfSHong Zhang else if (pos == is_max) max_no = 0;
73db41cccfSHong Zhang else max_no = is_max - pos;
74b829d874SHong Zhang c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */
75c6a7a370SJeremy L Thompson
76fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos, PETSC_TRUE));
77db41cccfSHong Zhang pos += max_no;
78db41cccfSHong Zhang }
79c6a7a370SJeremy L Thompson PetscCall(PetscStrncpy(obj->type_name, MATSEQSBAIJ, max_len));
80db41cccfSHong Zhang
81b11de2a9SHong Zhang /* 2) Row search */
829566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new));
83db41cccfSHong Zhang
84b11de2a9SHong Zhang /* 3) Column search */
85db41cccfSHong Zhang for (i = 0; i < is_max; i++) {
86db41cccfSHong Zhang asub_i = (Mat_SeqSBAIJ *)submats[i]->data;
87feb237baSPierre Jolivet ai = asub_i->i;
88db41cccfSHong Zhang
89db41cccfSHong Zhang /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
909566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table));
91db41cccfSHong Zhang
929566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is_new[i], &idx));
939566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is_new[i], &nis));
94db41cccfSHong Zhang for (l = 0; l < nis; l++) {
959566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table, idx[l]));
96db41cccfSHong Zhang nidx[l] = idx[l];
97db41cccfSHong Zhang }
98db41cccfSHong Zhang isz = nis;
99db41cccfSHong Zhang
100b11de2a9SHong Zhang /* add column entries to table */
101db41cccfSHong Zhang for (brow = 0; brow < Mbs; brow++) {
102db41cccfSHong Zhang nz = ai[brow + 1] - ai[brow];
103db41cccfSHong Zhang if (nz) {
104db41cccfSHong Zhang if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow;
105db41cccfSHong Zhang }
106db41cccfSHong Zhang }
1079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is_new[i], &idx));
1089566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is_new[i]));
109db41cccfSHong Zhang
110db41cccfSHong Zhang /* create updated is_new */
1119566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i));
112db41cccfSHong Zhang }
113db41cccfSHong Zhang
114db41cccfSHong Zhang /* Free tmp spaces */
11548a46eb9SPierre Jolivet for (i = 0; i < is_max; i++) PetscCall(MatDestroy(&submats[i]));
116db41cccfSHong Zhang }
117b829d874SHong Zhang
1189566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table));
1199566063dSJacob Faibussowitsch PetscCall(PetscFree(submats));
1209566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is_row[0]));
1219566063dSJacob Faibussowitsch PetscCall(PetscFree(is_row));
1229566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx));
123db41cccfSHong Zhang }
124c6a7a370SJeremy L Thompson }
125bd38d1d0SPierre Jolivet for (i = 0; i < is_max; i++) {
126bd38d1d0SPierre Jolivet PetscCall(ISDestroy(&is[i]));
127bd38d1d0SPierre Jolivet PetscCall(ISGetLocalSize(is_new[i], &nis));
128bd38d1d0SPierre Jolivet PetscCall(ISGetIndices(is_new[i], &idx));
129bd38d1d0SPierre Jolivet PetscCall(ISCreateBlock(PetscObjectComm((PetscObject)is_new[i]), bs, nis, idx, PETSC_COPY_VALUES, &is[i]));
130bd38d1d0SPierre Jolivet PetscCall(ISDestroy(&is_new[i]));
131bd38d1d0SPierre Jolivet }
1329566063dSJacob Faibussowitsch PetscCall(PetscFree(is_new));
1333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
134632d0f97SHong Zhang }
135632d0f97SHong Zhang
1369371c9d4SSatish Balay typedef enum {
1379371c9d4SSatish Balay MINE,
1389371c9d4SSatish Balay OTHER
1399371c9d4SSatish Balay } WhoseOwner;
1400472cc68SHong Zhang /* data1, odata1 and odata2 are packed in the format (for communication):
141a2a9f22aSHong Zhang data[0] = is_max, no of is
142a2a9f22aSHong Zhang data[1] = size of is[0]
143a2a9f22aSHong Zhang ...
144a2a9f22aSHong Zhang data[is_max] = size of is[is_max-1]
145a2a9f22aSHong Zhang data[is_max + 1] = data(is[0])
146a2a9f22aSHong Zhang ...
147a2a9f22aSHong Zhang data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
148a2a9f22aSHong Zhang ...
1490472cc68SHong Zhang data2 is packed in the format (for creating output is[]):
1500472cc68SHong Zhang data[0] = is_max, no of is
1510472cc68SHong Zhang data[1] = size of is[0]
1520472cc68SHong Zhang ...
1530472cc68SHong Zhang data[is_max] = size of is[is_max-1]
1540472cc68SHong Zhang data[is_max + 1] = data(is[0])
1550472cc68SHong Zhang ...
1560472cc68SHong Zhang data[is_max + 1 + Mbs*i) = data(is[i])
1570472cc68SHong Zhang ...
158a2a9f22aSHong Zhang */
MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])159d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[])
160d71ae5a4SJacob Faibussowitsch {
161632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
1626497c311SBarry Smith PetscMPIInt proc_end = 0, size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, *iwork;
1635d0c19d7SBarry Smith const PetscInt *idx_i;
1646497c311SBarry Smith PetscInt idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i, len;
16526fbe8dcSKarl Rupp PetscInt Mbs, i, j, k, *odata1, *odata2;
1666497c311SBarry Smith PetscInt **odata2_ptr, *ctable = NULL, *btable, len_max, len_est;
1676497c311SBarry Smith PetscInt len_unused, nodata2;
16813f74950SBarry Smith PetscInt ois_max; /* max no of is[] in each of processor */
169*73bc78fdSLisandro Dalcin PetscByte *t_p;
170632d0f97SHong Zhang MPI_Comm comm;
171e8527bf2SHong Zhang MPI_Request *s_waits1, *s_waits2, r_req;
172632d0f97SHong Zhang MPI_Status *s_status, r_status;
17345f43ab7SHong Zhang PetscBT *table; /* mark indices of this processor's is[] */
174fc70d14eSHong Zhang PetscBT table_i;
175bfc6803cSHong Zhang PetscBT otable; /* mark indices of other processors' is[] */
176d0f46423SBarry Smith PetscInt bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners;
17710eea884SHong Zhang IS garray_local, garray_gl;
1785483b11dSHong Zhang
179632d0f97SHong Zhang PetscFunctionBegin;
1809566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
181632d0f97SHong Zhang size = c->size;
182632d0f97SHong Zhang rank = c->rank;
183632d0f97SHong Zhang Mbs = c->Mbs;
184632d0f97SHong Zhang
1859566063dSJacob Faibussowitsch PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
1869566063dSJacob Faibussowitsch PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
187c910923dSHong Zhang
188430a0127SHong Zhang /* create tables used in
189430a0127SHong Zhang step 1: table[i] - mark c->garray of proc [i]
19045f43ab7SHong Zhang step 3: table[i] - mark indices of is[i] when whose=MINE
191430a0127SHong Zhang table[0] - mark incideces of is[] when whose=OTHER */
1922f613bf5SBarry Smith len = PetscMax(is_max, size);
1939566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p));
194ad540459SPierre Jolivet for (i = 0; i < len; i++) table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i;
195430a0127SHong Zhang
196462c564dSBarry Smith PetscCallMPI(MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm));
19710eea884SHong Zhang
1985483b11dSHong Zhang /* 1. Send this processor's is[] to other processors */
199e8527bf2SHong Zhang /* allocate spaces */
2009566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &n));
2015483b11dSHong Zhang len = 0;
202c910923dSHong Zhang for (i = 0; i < is_max; i++) {
2039566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n[i]));
204632d0f97SHong Zhang len += n[i];
205632d0f97SHong Zhang }
206958c9bccSBarry Smith if (!len) {
2075483b11dSHong Zhang is_max = 0;
2085483b11dSHong Zhang } else {
2095483b11dSHong Zhang len += 1 + is_max; /* max length of data1 for one processor */
2105483b11dSHong Zhang }
2115483b11dSHong Zhang
2129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size * len + 1, &data1));
2139566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &data1_start));
2145483b11dSHong Zhang for (i = 0; i < size; i++) data1_start[i] = data1 + i * len;
2155483b11dSHong Zhang
2169566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners));
217e8527bf2SHong Zhang
21876f244e2SHong Zhang /* gather c->garray from all processors */
2199566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local));
2209566063dSJacob Faibussowitsch PetscCall(ISAllGather(garray_local, &garray_gl));
2219566063dSJacob Faibussowitsch PetscCall(ISDestroy(&garray_local));
2229566063dSJacob Faibussowitsch PetscCallMPI(MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm));
22326fbe8dcSKarl Rupp
22476f244e2SHong Zhang Bowners[0] = 0;
22576f244e2SHong Zhang for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i];
22676f244e2SHong Zhang
2275483b11dSHong Zhang if (is_max) {
228430a0127SHong Zhang /* hash table ctable which maps c->row to proc_id) */
2299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Mbs, &ctable));
230a3f1d042SPierre Jolivet j = 0;
231a3f1d042SPierre Jolivet for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
23226fbe8dcSKarl Rupp for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id;
2335483b11dSHong Zhang }
2345483b11dSHong Zhang
235430a0127SHong Zhang /* hash tables marking c->garray */
2369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(garray_gl, &idx_i));
2375483b11dSHong Zhang for (i = 0; i < size; i++) {
238430a0127SHong Zhang table_i = table[i];
2399566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table_i));
240430a0127SHong Zhang for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/
2419566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table_i, idx_i[j]));
2425483b11dSHong Zhang }
2435483b11dSHong Zhang }
2449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(garray_gl, &idx_i));
2455483b11dSHong Zhang } /* if (is_max) */
2469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&garray_gl));
2475483b11dSHong Zhang
2485483b11dSHong Zhang /* evaluate communication - mesg to who, length, and buffer space */
249e8527bf2SHong Zhang for (i = 0; i < size; i++) len_s[i] = 0;
2505483b11dSHong Zhang
2515483b11dSHong Zhang /* header of data1 */
2526497c311SBarry Smith for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
2535483b11dSHong Zhang iwork[proc_id] = 0;
2545483b11dSHong Zhang *data1_start[proc_id] = is_max;
2555483b11dSHong Zhang data1_start[proc_id]++;
2565483b11dSHong Zhang for (j = 0; j < is_max; j++) {
2575483b11dSHong Zhang if (proc_id == rank) {
2585483b11dSHong Zhang *data1_start[proc_id] = n[j];
2595483b11dSHong Zhang } else {
2605483b11dSHong Zhang *data1_start[proc_id] = 0;
2615483b11dSHong Zhang }
2625483b11dSHong Zhang data1_start[proc_id]++;
2635483b11dSHong Zhang }
2645483b11dSHong Zhang }
2655483b11dSHong Zhang
2665483b11dSHong Zhang for (i = 0; i < is_max; i++) {
2679566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx_i));
2685483b11dSHong Zhang for (j = 0; j < n[i]; j++) {
2695483b11dSHong Zhang idx = idx_i[j];
2709371c9d4SSatish Balay *data1_start[rank] = idx;
27135cb6cd3SPierre Jolivet data1_start[rank]++; /* for local processing */
2726497c311SBarry Smith PetscCall(PetscMPIIntCast(ctable[idx], &proc_end));
2736497c311SBarry Smith for (PetscMPIInt proc_id = 0; proc_id <= proc_end; proc_id++) { /* for others to process */
274e8527bf2SHong Zhang if (proc_id == rank) continue; /* done before this loop */
27526fbe8dcSKarl Rupp if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */
2769371c9d4SSatish Balay *data1_start[proc_id] = idx;
2779371c9d4SSatish Balay data1_start[proc_id]++;
2785483b11dSHong Zhang len_s[proc_id]++;
2795483b11dSHong Zhang }
2805483b11dSHong Zhang }
2815483b11dSHong Zhang /* update header data */
2826497c311SBarry Smith for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
2835483b11dSHong Zhang if (proc_id == rank) continue;
2845483b11dSHong Zhang *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id];
2855483b11dSHong Zhang iwork[proc_id] = len_s[proc_id];
2865483b11dSHong Zhang }
2879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx_i));
288e8527bf2SHong Zhang }
2895483b11dSHong Zhang
2909371c9d4SSatish Balay nrqs = 0;
2919371c9d4SSatish Balay nrqr = 0;
2925483b11dSHong Zhang for (i = 0; i < size; i++) {
2935483b11dSHong Zhang data1_start[i] = data1 + i * len;
2945483b11dSHong Zhang if (len_s[i]) {
2955483b11dSHong Zhang nrqs++;
2965483b11dSHong Zhang len_s[i] += 1 + is_max; /* add no. of header msg */
2975483b11dSHong Zhang }
2985483b11dSHong Zhang }
2995483b11dSHong Zhang
30048a46eb9SPierre Jolivet for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is[i]));
3019566063dSJacob Faibussowitsch PetscCall(PetscFree(n));
3029566063dSJacob Faibussowitsch PetscCall(PetscFree(ctable));
3035483b11dSHong Zhang
3045483b11dSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */
3059566063dSJacob Faibussowitsch PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr));
3069566063dSJacob Faibussowitsch PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1));
307632d0f97SHong Zhang
308632d0f97SHong Zhang /* Now post the sends */
3099566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(size, &s_waits1, size, &s_waits2));
310632d0f97SHong Zhang k = 0;
3116497c311SBarry Smith for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */
3125483b11dSHong Zhang if (len_s[proc_id]) {
3136497c311SBarry Smith PetscCallMPI(MPIU_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k));
314632d0f97SHong Zhang k++;
315632d0f97SHong Zhang }
316632d0f97SHong Zhang }
317632d0f97SHong Zhang
31845f43ab7SHong Zhang /* 2. Receive other's is[] and process. Then send back */
3195483b11dSHong Zhang len = 0;
3205483b11dSHong Zhang for (i = 0; i < nrqr; i++) {
3215483b11dSHong Zhang if (len_r1[i] > len) len = len_r1[i];
3225483b11dSHong Zhang }
3239566063dSJacob Faibussowitsch PetscCall(PetscFree(len_r1));
3249566063dSJacob Faibussowitsch PetscCall(PetscFree(id_r1));
32545f43ab7SHong Zhang
3266497c311SBarry Smith for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;
32745f43ab7SHong Zhang
3289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len + 1, &odata1));
3299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &odata2_ptr));
3309566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &otable));
33110eea884SHong Zhang
33210eea884SHong Zhang len_max = ois_max * (Mbs + 1); /* max space storing all is[] for each receive */
333240e5896SHong Zhang len_est = 2 * len_max; /* estimated space of storing is[] for all receiving messages */
3349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2));
33510eea884SHong Zhang nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
33626fbe8dcSKarl Rupp
337240e5896SHong Zhang odata2_ptr[nodata2] = odata2;
33826fbe8dcSKarl Rupp
33910eea884SHong Zhang len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */
34010eea884SHong Zhang
341632d0f97SHong Zhang k = 0;
3425483b11dSHong Zhang while (k < nrqr) {
3436497c311SBarry Smith PetscMPIInt ilen;
3446497c311SBarry Smith
345632d0f97SHong Zhang /* Receive messages */
3469566063dSJacob Faibussowitsch PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status));
347632d0f97SHong Zhang if (flag) {
3486497c311SBarry Smith PetscMPIInt proc_id;
3496497c311SBarry Smith
3506497c311SBarry Smith PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &ilen));
351632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE;
3526497c311SBarry Smith PetscCallMPI(MPIU_Irecv(odata1, ilen, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
3539566063dSJacob Faibussowitsch PetscCallMPI(MPI_Wait(&r_req, &r_status));
354632d0f97SHong Zhang
355fc70d14eSHong Zhang /* Process messages */
356240e5896SHong Zhang /* make sure there is enough unused space in odata2 array */
35710eea884SHong Zhang if (len_unused < len_max) { /* allocate more space for odata2 */
3589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2));
359240e5896SHong Zhang odata2_ptr[++nodata2] = odata2;
36010eea884SHong Zhang len_unused = len_est;
36110eea884SHong Zhang }
36210eea884SHong Zhang
3639566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable));
364a2a9f22aSHong Zhang len = 1 + odata2[0];
36526fbe8dcSKarl Rupp for (i = 0; i < odata2[0]; i++) len += odata2[1 + i];
366632d0f97SHong Zhang
367632d0f97SHong Zhang /* Send messages back */
3686497c311SBarry Smith PetscCallMPI(MPIU_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k));
369fc70d14eSHong Zhang k++;
37010eea884SHong Zhang odata2 += len;
37110eea884SHong Zhang len_unused -= len;
3726497c311SBarry Smith PetscCall(PetscMPIIntCast(len, &len_s[proc_id])); /* length of message sending back to proc_id */
373632d0f97SHong Zhang }
3745483b11dSHong Zhang }
3759566063dSJacob Faibussowitsch PetscCall(PetscFree(odata1));
3769566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&otable));
377632d0f97SHong Zhang
37845f43ab7SHong Zhang /* 3. Do local work on this processor's is[] */
37945f43ab7SHong Zhang /* make sure there is enough unused space in odata2(=data) array */
38045f43ab7SHong Zhang len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */
38110eea884SHong Zhang if (len_unused < len_max) { /* allocate more space for odata2 */
3829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2));
38326fbe8dcSKarl Rupp
384240e5896SHong Zhang odata2_ptr[++nodata2] = odata2;
38510eea884SHong Zhang }
386bfc6803cSHong Zhang
387240e5896SHong Zhang data = odata2;
3889566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table));
3899566063dSJacob Faibussowitsch PetscCall(PetscFree(data1_start));
39045f43ab7SHong Zhang
39145f43ab7SHong Zhang /* 4. Receive work done on other processors, then merge */
39245f43ab7SHong Zhang /* get max number of messages that this processor expects to recv */
393462c564dSBarry Smith PetscCallMPI(MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm));
3949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(iwork[rank] + 1, &data2));
3959566063dSJacob Faibussowitsch PetscCall(PetscFree4(len_s, btable, iwork, Bowners));
39645f43ab7SHong Zhang
397632d0f97SHong Zhang k = 0;
3985483b11dSHong Zhang while (k < nrqs) {
399632d0f97SHong Zhang /* Receive messages */
4009566063dSJacob Faibussowitsch PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status));
401632d0f97SHong Zhang if (flag) {
4026497c311SBarry Smith PetscMPIInt proc_id, ilen;
4036497c311SBarry Smith PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &ilen));
404632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE;
4056497c311SBarry Smith PetscCallMPI(MPIU_Irecv(data2, ilen, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
4069566063dSJacob Faibussowitsch PetscCallMPI(MPI_Wait(&r_req, &r_status));
4076497c311SBarry Smith if (ilen > 1 + is_max) { /* Add data2 into data */
4080472cc68SHong Zhang data2_i = data2 + 1 + is_max;
409fc70d14eSHong Zhang for (i = 0; i < is_max; i++) {
410fc70d14eSHong Zhang table_i = table[i];
411bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs * i;
412bfc6803cSHong Zhang isz = data[1 + i];
4130472cc68SHong Zhang for (j = 0; j < data2[1 + i]; j++) {
4140472cc68SHong Zhang col = data2_i[j];
41526fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col;
416fc70d14eSHong Zhang }
417bfc6803cSHong Zhang data[1 + i] = isz;
4180472cc68SHong Zhang if (i < is_max - 1) data2_i += data2[1 + i];
419fc70d14eSHong Zhang }
4205483b11dSHong Zhang }
421632d0f97SHong Zhang k++;
422632d0f97SHong Zhang }
4235483b11dSHong Zhang }
4249566063dSJacob Faibussowitsch PetscCall(PetscFree(data2));
4259566063dSJacob Faibussowitsch PetscCall(PetscFree2(table, t_p));
4265483b11dSHong Zhang
4275483b11dSHong Zhang /* phase 1 sends are complete */
4289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &s_status));
4299566063dSJacob Faibussowitsch if (nrqs) PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status));
4309566063dSJacob Faibussowitsch PetscCall(PetscFree(data1));
4315483b11dSHong Zhang
432240e5896SHong Zhang /* phase 2 sends are complete */
4339566063dSJacob Faibussowitsch if (nrqr) PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status));
4349566063dSJacob Faibussowitsch PetscCall(PetscFree2(s_waits1, s_waits2));
4359566063dSJacob Faibussowitsch PetscCall(PetscFree(s_status));
436632d0f97SHong Zhang
437c910923dSHong Zhang /* 5. Create new is[] */
438c910923dSHong Zhang for (i = 0; i < is_max; i++) {
439bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs * i;
4409566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i));
441632d0f97SHong Zhang }
44248a46eb9SPierre Jolivet for (k = 0; k <= nodata2; k++) PetscCall(PetscFree(odata2_ptr[k]));
4439566063dSJacob Faibussowitsch PetscCall(PetscFree(odata2_ptr));
4443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
445632d0f97SHong Zhang }
446632d0f97SHong Zhang
447632d0f97SHong Zhang /*
448dc008846SHong Zhang MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
449632d0f97SHong Zhang the work on the local processor.
450632d0f97SHong Zhang
451632d0f97SHong Zhang Inputs:
452632d0f97SHong Zhang C - MAT_MPISBAIJ;
453bfc6803cSHong Zhang data - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
454bfc6803cSHong Zhang whose - whose is[] to be processed,
455bfc6803cSHong Zhang MINE: this processor's is[]
456bfc6803cSHong Zhang OTHER: other processor's is[]
457632d0f97SHong Zhang Output:
45810eea884SHong Zhang nidx - whose = MINE:
4590472cc68SHong Zhang holds input and newly found indices in the same format as data
4600472cc68SHong Zhang whose = OTHER:
4610472cc68SHong Zhang only holds the newly found indices
4620472cc68SHong Zhang table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
463632d0f97SHong Zhang */
46476f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt * data,PetscInt whose,PetscInt * nidx,PetscBT * table)465d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table)
466d71ae5a4SJacob Faibussowitsch {
467632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
468f4f49eeaSPierre Jolivet Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)c->A->data;
469f4f49eeaSPierre Jolivet Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)c->B->data;
47013f74950SBarry Smith PetscInt row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l;
47113f74950SBarry Smith PetscInt a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n;
472bfc6803cSHong Zhang PetscBT table0; /* mark the indices of input is[] for look up */
473da81f932SPierre Jolivet PetscBT table_i; /* points to i-th table. When whose=OTHER, a single table is used for all is[] */
474632d0f97SHong Zhang
475632d0f97SHong Zhang PetscFunctionBegin;
4769371c9d4SSatish Balay Mbs = c->Mbs;
4779371c9d4SSatish Balay mbs = a->mbs;
4789371c9d4SSatish Balay ai = a->i;
4799371c9d4SSatish Balay aj = a->j;
4809371c9d4SSatish Balay bi = b->i;
4819371c9d4SSatish Balay bj = b->j;
482632d0f97SHong Zhang garray = c->garray;
483899cda47SBarry Smith rstart = c->rstartbs;
484dc008846SHong Zhang is_max = data[0];
485632d0f97SHong Zhang
4869566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &table0));
487fc70d14eSHong Zhang
488fc70d14eSHong Zhang nidx[0] = is_max;
489fc70d14eSHong Zhang idx_i = data + is_max + 1; /* ptr to input is[0] array */
490bfc6803cSHong Zhang nidx_i = nidx + is_max + 1; /* ptr to output is[0] array */
491dc008846SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */
492dc008846SHong Zhang isz = 0;
493fc70d14eSHong Zhang n = data[1 + i]; /* size of input is[i] */
494dc008846SHong Zhang
49576f244e2SHong Zhang /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
496bfc6803cSHong Zhang if (whose == MINE) { /* process this processor's is[] */
497bfc6803cSHong Zhang table_i = table[i];
4980472cc68SHong Zhang nidx_i = nidx + 1 + is_max + Mbs * i;
499bfc6803cSHong Zhang } else { /* process other processor's is[] - only use one temp table */
500430a0127SHong Zhang table_i = table[0];
501bfc6803cSHong Zhang }
5029566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table_i));
5039566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table0));
50476f244e2SHong Zhang if (n == 0) {
50576f244e2SHong Zhang nidx[1 + i] = 0; /* size of new is[i] */
50676f244e2SHong Zhang continue;
50776f244e2SHong Zhang }
50876f244e2SHong Zhang
5099371c9d4SSatish Balay isz0 = 0;
5109371c9d4SSatish Balay col_max = 0;
511dc008846SHong Zhang for (j = 0; j < n; j++) {
512dc008846SHong Zhang col = idx_i[j];
51308401ef6SPierre Jolivet PetscCheck(col < Mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index col %" PetscInt_FMT " >= Mbs %" PetscInt_FMT, col, Mbs);
514bfc6803cSHong Zhang if (!PetscBTLookupSet(table_i, col)) {
5159566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table0, col));
51626fbe8dcSKarl Rupp if (whose == MINE) nidx_i[isz0] = col;
517dbe03f88SHong Zhang if (col_max < col) col_max = col;
518bfc6803cSHong Zhang isz0++;
519bfc6803cSHong Zhang }
520632d0f97SHong Zhang }
521dc008846SHong Zhang
52226fbe8dcSKarl Rupp if (whose == MINE) isz = isz0;
523fc70d14eSHong Zhang k = 0; /* no. of indices from input is[i] that have been examined */
524dc008846SHong Zhang for (row = 0; row < mbs; row++) {
5259371c9d4SSatish Balay a_start = ai[row];
5269371c9d4SSatish Balay a_end = ai[row + 1];
5279371c9d4SSatish Balay b_start = bi[row];
5289371c9d4SSatish Balay b_end = bi[row + 1];
5290472cc68SHong Zhang if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]:
5300472cc68SHong Zhang do row search: collect all col in this row */
531dc008846SHong Zhang for (l = a_start; l < a_end; l++) { /* Amat */
532dc008846SHong Zhang col = aj[l] + rstart;
53326fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
534dc008846SHong Zhang }
535dc008846SHong Zhang for (l = b_start; l < b_end; l++) { /* Bmat */
536dc008846SHong Zhang col = garray[bj[l]];
53726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
538dc008846SHong Zhang }
539dc008846SHong Zhang k++;
540dc008846SHong Zhang if (k >= isz0) break; /* for (row=0; row<mbs; row++) */
5410472cc68SHong Zhang } else { /* row is not on input is[i]:
542da81f932SPierre Jolivet do col search: add row onto nidx_i if there is a col in nidx_i */
543dc008846SHong Zhang for (l = a_start; l < a_end; l++) { /* Amat */
544dc008846SHong Zhang col = aj[l] + rstart;
54576f244e2SHong Zhang if (col > col_max) break;
546dc008846SHong Zhang if (PetscBTLookup(table0, col)) {
54726fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
548dc008846SHong Zhang break; /* for l = start; l<end ; l++) */
549632d0f97SHong Zhang }
550632d0f97SHong Zhang }
551dc008846SHong Zhang for (l = b_start; l < b_end; l++) { /* Bmat */
552dc008846SHong Zhang col = garray[bj[l]];
55376f244e2SHong Zhang if (col > col_max) break;
554dc008846SHong Zhang if (PetscBTLookup(table0, col)) {
55526fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
556dc008846SHong Zhang break; /* for l = start; l<end ; l++) */
557632d0f97SHong Zhang }
558dc008846SHong Zhang }
559dc008846SHong Zhang }
560bfc6803cSHong Zhang }
561dc008846SHong Zhang
562dc008846SHong Zhang if (i < is_max - 1) {
563fc70d14eSHong Zhang idx_i += n; /* ptr to input is[i+1] array */
564bfc6803cSHong Zhang nidx_i += isz; /* ptr to output is[i+1] array */
565dc008846SHong Zhang }
566fc70d14eSHong Zhang nidx[1 + i] = isz; /* size of new is[i] */
5671ab97a2aSSatish Balay } /* for each is */
5689566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table0));
5693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
570632d0f97SHong Zhang }
571