xref: /petsc/src/mat/impls/dense/mpi/mmdense.c (revision daa037dfd3c3bec8dc8659548d2b20b07c1dc6de)
1 
2 /*
3    Support for the parallel dense matrix vector multiply
4 */
5 #include <../src/mat/impls/dense/mpi/mpidense.h>
6 #include <petscblaslapack.h>
7 
8 PetscErrorCode MatSetUpMultiply_MPIDense(Mat mat)
9 {
10   Mat_MPIDense   *mdn = (Mat_MPIDense*)mat->data;
11 
12   PetscFunctionBegin;
13   /* Create local vector that is used to scatter into */
14   PetscCall(VecDestroy(&mdn->lvec));
15   if (mdn->A) {
16     PetscCall(MatCreateVecs(mdn->A,&mdn->lvec,NULL));
17     PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mdn->lvec));
18   }
19   if (!mdn->Mvctx) {
20     PetscCall(PetscLayoutSetUp(mat->cmap));
21     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)mat),&mdn->Mvctx));
22     PetscCall(PetscSFSetGraphWithPattern(mdn->Mvctx,mat->cmap,PETSCSF_PATTERN_ALLGATHER));
23     PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mdn->Mvctx));
24   }
25   PetscFunctionReturn(0);
26 }
27 
28 static PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
29 
30 PetscErrorCode MatCreateSubMatrices_MPIDense(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
31 {
32   PetscInt       nmax,nstages_local,nstages,i,pos,max_no;
33 
34   PetscFunctionBegin;
35   /* Allocate memory to hold all the submatrices */
36   if (scall != MAT_REUSE_MATRIX) {
37     PetscCall(PetscCalloc1(ismax+1,submat));
38   }
39   /* Determine the number of stages through which submatrices are done */
40   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
41   if (!nmax) nmax = 1;
42   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
43 
44   /* Make sure every processor loops through the nstages */
45   PetscCall(MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
46 
47   for (i=0,pos=0; i<nstages; i++) {
48     if (pos+nmax <= ismax) max_no = nmax;
49     else if (pos == ismax) max_no = 0;
50     else                   max_no = ismax-pos;
51     PetscCall(MatCreateSubMatrices_MPIDense_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
52     pos += max_no;
53   }
54   PetscFunctionReturn(0);
55 }
56 /* -------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIDense_Local - Extracts the sequential dense submatrices
   C(isrow[i],iscol[i]), i = 0,...,ismax-1, from the parallel dense matrix C.

   Collective on the communicator of C.  All index sets must be sorted.

   Communication plan (two message rounds):
     round 1 (tag0): every rank sends to each owning rank the list of global row
        numbers it needs, prefixed by a header [nIS | (is#,count) pairs | rows];
     round 2 (tag1): the owners reply with the complete rows (all N columns);
        the requester copies out only the wanted columns.
   Rows owned locally are copied directly, without communication.

   With scall == MAT_REUSE_MATRIX the matrices in submats[] are reused and must
   already have the correct dimensions; otherwise new SeqDense matrices are created.
*/
PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIDense   *c = (Mat_MPIDense*)C->data;
  Mat            A  = c->A;                      /* local sequential block of C */
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data,*mat;
  PetscMPIInt    rank,size,tag0,tag1,idex,end,i;
  PetscInt       N = C->cmap->N,rstart = C->rmap->rstart,count;
  const PetscInt **irow,**icol,*irow_i;
  PetscInt       *nrow,*ncol,*w1,*w3,*w4,*rtable,start;
  PetscInt       **sbuf1,m,j,k,l,ct1,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*ctr,*pa,*tmp,bsz,nrqr;
  PetscInt       is_no,jmax,**rmap,*rmap_i;
  PetscInt       ctr_j,*sbuf1_j,*rbuf1_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status2;
  MPI_Comm       comm;
  PetscScalar    **rbuf2,**sbuf2;
  PetscBool      sorted;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  tag0 = ((PetscObject)C)->tag;                  /* tag for round 1: row-number requests */
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  m    = C->rmap->N;

  /* Get some new tags to keep the communication clean */
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1)); /* tag for round 2: row values */

  /* Check that both the row and col indices are sorted */
  for (i=0; i<ismax; i++) {
    PetscCall(ISSorted(isrow[i],&sorted));
    PetscCheck(sorted,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
    PetscCall(ISSorted(iscol[i],&sorted));
    PetscCheck(sorted,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted");
  }

  /* Cache the raw indices and local sizes of every index set */
  PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,m,&rtable));
  for (i=0; i<ismax; i++) {
    PetscCall(ISGetIndices(isrow[i],&irow[i]));
    PetscCall(ISGetIndices(iscol[i],&icol[i]));
    PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));
    PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
  }

  /* Create hash table for the mapping :row -> proc (owner of each global row) */
  for (i=0,j=0; i<size; i++) {
    jmax = C->rmap->range[i+1];
    for (; j<jmax; j++) rtable[j] = i;
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[2*p] = number of rows requested from proc p (header added later);
     w3[p]   = number of ISs that need rows from proc p;
     w4[]    = per-IS scratch counter */
  PetscCall(PetscMalloc3(2*size,&w1,size,&w3,size,&w4));
  PetscCall(PetscArrayzero(w1,size*2)); /* initialize work vector*/
  PetscCall(PetscArrayzero(w3,size)); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[2*j] += w4[j];  w3[j]++;}
    }
  }

  nrqs       = 0;              /* no of outgoing messages */
  msz        = 0;              /* total mesg length (for all procs) */
  w1[2*rank] = 0;              /* no mesg sent to self */
  w3[rank]   = 0;
  for (i=0; i<size; i++) {
    if (w1[2*i])  { w1[2*i+1] = 1; nrqs++;} /* there exists a message to proc i */
  }
  PetscCall(PetscMalloc1(nrqs+1,&pa)); /*(proc -array): the ranks we send requests to */
  for (i=0,j=0; i<size; i++) {
    if (w1[2*i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j        = pa[i];
    w1[2*j] += w1[2*j+1] + 2* w3[j];
    msz     += w1[2*j];
  }
  /* Do a global reduction to determine how many messages to expect*/
  PetscCall(PetscMaxSum(comm,w1,&bsz,&nrqr));

  /* Allocate memory for recv buffers . Make sure rbuf1[0] exists by adding 1 to the buffer length */
  PetscCall(PetscMalloc1(nrqr+1,&rbuf1));
  PetscCall(PetscMalloc1(nrqr*bsz,&rbuf1[0]));
  for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives for the round-1 row-number requests */
  PetscCall(PetscMalloc1(nrqr+1,&r_waits1));
  for (i=0; i<nrqr; ++i) {
    PetscCallMPI(MPI_Irecv(rbuf1[i],bsz,MPIU_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i));
  }

  /* Allocate Memory for outgoing messages; sbuf1[p] aliases into the single tmp arena */
  PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(sbuf1,size));
  PetscCall(PetscArrayzero(ptr,size));
  {
    /* Carve tmp into per-destination slices of length w1[2*j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[2*j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space; ptr[j] points just past the header, where row numbers go */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    PetscCall(PetscArrayzero(ctr,size));
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS: record (IS number, row count) pairs */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  PetscCall(PetscMalloc1(nrqs+1,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    PetscCallMPI(MPI_Isend(sbuf1[j],w1[2*j],MPIU_INT,j,tag0,comm,s_waits1+i));
  }

  /* Post receives to capture the row_data from other procs */
  PetscCall(PetscMalloc1(nrqs+1,&r_waits2));
  PetscCall(PetscMalloc1(nrqs+1,&rbuf2));
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    /* number of requested rows = total message length minus header, times N columns per row */
    count = (w1[2*j] - (2*sbuf1[j][0] + 1))*N;
    PetscCall(PetscMalloc1(count+1,&rbuf2[i]));
    PetscCallMPI(MPI_Irecv(rbuf2[i],count,MPIU_SCALAR,j,tag1,comm,r_waits2+i));
  }

  /* Receive messages(row_nos) and then, pack and send off the rowvalues
     to the correct processors */

  PetscCall(PetscMalloc1(nrqr+1,&s_waits2));
  PetscCall(PetscMalloc1(nrqr+1,&r_status1));
  PetscCall(PetscMalloc1(nrqr+1,&sbuf2));

  {
    PetscScalar *sbuf2_i,*v_start;
    PetscInt    s_proc;
    for (i=0; i<nrqr; ++i) {
      PetscCallMPI(MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i));
      s_proc  = r_status1[i].MPI_SOURCE;         /* send processor */
      rbuf1_i = rbuf1[idex];         /* Actual message from s_proc */
      /* number of requested rows = end - start: start skips past the header,
         end is the actual length of the received message */
      start = 2*rbuf1_i[0] + 1;
      PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
      /* allocate memory sufficient to hold all the row values */
      PetscCall(PetscMalloc1((end-start)*N,&sbuf2[idex]));
      sbuf2_i = sbuf2[idex];
      /* Now pack the data: a->v is stored with leading dimension a->lda, so
         consecutive entries of one row are a->lda apart in memory */
      for (j=start; j<end; j++) {
        row     = rbuf1_i[j] - rstart;
        v_start = a->v + row;
        for (k=0; k<N; k++) {
          sbuf2_i[0] = v_start[0];
          sbuf2_i++;
          v_start += a->lda;
        }
      }
      /* Now send off the data */
      PetscCallMPI(MPI_Isend(sbuf2[idex],(end-start)*N,MPIU_SCALAR,s_proc,tag1,comm,s_waits2+i));
    }
  }
  /* End Send-Recv of IS + row_numbers */
  PetscCall(PetscFree(r_status1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscMalloc1(nrqs+1,&s_status1));
  if (nrqs) PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
  PetscCall(PetscFree(s_status1));
  PetscCall(PetscFree(s_waits1));

  /* Create the submatrices (or zero out the reused ones) */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqDense*)(submats[i]->data);
      PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      PetscCall(PetscArrayzero(mat->v,submats[i]->rmap->n*submats[i]->cmap->n));

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
      PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]));
      PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
      PetscCall(MatSeqDenseSetPreallocation(submats[i],NULL));
    }
  }

  /* Assemble the matrices: copy the locally owned rows without communication */
  {
    PetscInt    col;
    PetscScalar *imat_v,*mat_v,*imat_vi,*mat_vi;

    for (i=0; i<ismax; i++) {
      mat    = (Mat_SeqDense*)submats[i]->data;
      mat_v  = a->v;
      imat_v = mat->v;
      irow_i = irow[i];
      m      = nrow[i];
      for (j=0; j<m; j++) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          row     = row - rstart;
          mat_vi  = mat_v + row;
          imat_vi = imat_v + j;
          for (k=0; k<ncol[i]; k++) {
            col          = icol[i][k];
            imat_vi[k*m] = mat_vi[col*a->lda];
          }
        }
      }
    }
  }

  /* Create row map-> This maps c->row to submat->row for each submat*/
  /* this is a very expensive operation wrt memory usage */
  PetscCall(PetscMalloc1(ismax,&rmap));
  PetscCall(PetscCalloc1(ismax*C->rmap->N,&rmap[0]));
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Now Receive the row_values and assemble the rest of the matrix */
  PetscCall(PetscMalloc1(nrqs+1,&r_status2));
  {
    PetscInt    is_max,tmp1,col,*sbuf1_i,is_sz;
    PetscScalar *rbuf2_i,*imat_v,*imat_vi;

    for (tmp1=0; tmp1<nrqs; tmp1++) { /* For each message */
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&i,r_status2+tmp1));
      /* Now dig out the corresponding sbuf1, which contains the IS data_structure */
      sbuf1_i = sbuf1[pa[i]];
      is_max  = sbuf1_i[0];          /* number of ISs packed in this message */
      ct1     = 2*is_max+1;          /* offset of the first row number, past the header */
      rbuf2_i = rbuf2[i];
      for (j=1; j<=is_max; j++) { /* For each IS belonging to the message */
        is_no  = sbuf1_i[2*j-1];
        is_sz  = sbuf1_i[2*j];
        mat    = (Mat_SeqDense*)submats[is_no]->data;
        imat_v = mat->v;
        rmap_i = rmap[is_no];
        m      = nrow[is_no];
        for (k=0; k<is_sz; k++,rbuf2_i+=N) {  /* For each row (each reply row holds all N columns) */
          row     = sbuf1_i[ct1]; ct1++;
          row     = rmap_i[row];   /* translate global row -> submatrix row */
          imat_vi = imat_v + row;
          for (l=0; l<ncol[is_no]; l++) { /* For each col */
            col          = icol[is_no][l];
            imat_vi[l*m] = rbuf2_i[col];
          }
        }
      }
    }
  }
  /* End Send-Recv of row_values */
  PetscCall(PetscFree(r_status2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscMalloc1(nrqr+1,&s_status2));
  if (nrqr) PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
  PetscCall(PetscFree(s_status2));
  PetscCall(PetscFree(s_waits2));

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    PetscCall(ISRestoreIndices(isrow[i],irow+i));
    PetscCall(ISRestoreIndices(iscol[i],icol+i));
  }

  /* Free all the work arrays and communication buffers */
  PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,rtable));
  PetscCall(PetscFree3(w1,w3,w4));
  PetscCall(PetscFree(pa));

  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscFree(rbuf2[i]));
  }
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree4(sbuf1,ptr,tmp,ctr));
  PetscCall(PetscFree(rbuf1[0]));
  PetscCall(PetscFree(rbuf1));

  for (i=0; i<nrqr; ++i) {
    PetscCall(PetscFree(sbuf2[i]));
  }

  PetscCall(PetscFree(sbuf2));
  PetscCall(PetscFree(rmap[0]));
  PetscCall(PetscFree(rmap));

  for (i=0; i<ismax; i++) {
    PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
  }
  PetscFunctionReturn(0);
}
398 
399 PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat inA,PetscScalar alpha)
400 {
401   Mat_MPIDense   *A = (Mat_MPIDense*)inA->data;
402 
403   PetscFunctionBegin;
404   PetscCall(MatScale(A->A,alpha));
405   PetscFunctionReturn(0);
406 }
407