xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 71cd8a506e963ea19173ce2afedb030e5ef2c9e8)
#ifndef lint
static char vcid[] = "$Id: mpiaij.c,v 1.70 1995/08/24 22:28:22 bsmith Exp curfman $";
#endif

#include "mpiaij.h"
#include "vec/vecimpl.h"
#include "inline/spops.h"

/* Local utility routine that creates a mapping from the global column
number to the local number in the off-diagonal part of the local
storage of the matrix.  This is done in a non-scalable way, since the
length of colmap equals the global number of columns.
*/
static int CreateColmap_Private(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  Mat_AIJ    *B = (Mat_AIJ*) aij->B->data;
  int        n = B->n,i;
  aij->colmap = (int *) PETSCMALLOC(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
  PLogObjectMemory(mat,aij->N*sizeof(int));
  PETSCMEMSET(aij->colmap,0,aij->N*sizeof(int));
  for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
  return 0;
}
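
/*
   How colmap is consumed (see MatSetValues_MPIAIJ below): garray[] maps a
   local column of the off-diagonal block B to its global column number;
   colmap[] is the inverse, storing local+1 so that 0 can flag "this global
   column is not present in B":

       col = aij->colmap[global_col] - 1;    (col < 0 means not present)
*/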

extern int DisAssemble_MPIAIJ(Mat);

static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  int        ierr;
  if (aij->numtids == 1) {
    ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
  } else
    SETERRQ(1,"MatGetReordering_MPIAIJ: not yet supported in parallel");
  return 0;
}

static int MatSetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,
                               int *idxn,Scalar *v,InsertMode addv)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
  int        cstart = aij->cstart, cend = aij->cend,row,col;

  if (aij->insertmode != NOTSETVALUES && aij->insertmode != addv) {
    SETERRQ(1,"MatSetValues_MPIAIJ:You cannot mix inserts and adds");
  }
  aij->insertmode = addv;
  for ( i=0; i<m; i++ ) {
    if (idxm[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
    if (idxm[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
    if (idxm[i] >= rstart && idxm[i] < rend) {
      row = idxm[i] - rstart;
      for ( j=0; j<n; j++ ) {
        if (idxn[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
        if (idxn[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
        if (idxn[j] >= cstart && idxn[j] < cend){
          col = idxn[j] - cstart;
          ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
        }
        else {
          if (aij->assembled) {
            if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
            col = aij->colmap[idxn[j]] - 1;
            if (col < 0 && !((Mat_AIJ*)(aij->A->data))->nonew) {
              ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
              col = idxn[j];
            }
          }
          else col = idxn[j];
          ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
        }
      }
    }
    else {
      ierr = StashValues_Private(&aij->stash,idxm[i],n,idxn,v+i*n,addv);
      CHKERRQ(ierr);
    }
  }
  return 0;
}

/*
    The assembly code is a lot like the code for vectors; we should
    eventually derive a single assembly routine that can be used for
    either case.
*/

static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         numtids = aij->numtids, *owners = aij->rowners;
  int         mytid = aij->mytid;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  int         tag = mat->tag, *owner,*starts,count,ierr;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  /* make sure all processors are in either INSERTVALUES or ADDVALUES mode */
  MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,1,MPI_INT,
                MPI_BOR,comm);
  if (addv == (ADDVALUES|INSERTVALUES)) {
    SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted, others added");
  }
  aij->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PETSCMALLOC( 2*numtids*sizeof(int) ); CHKPTRQ(nprocs);
  PETSCMEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) PETSCMALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<aij->stash.n; i++ ) {
    idx = aij->stash.idx[i];
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length */
  work = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nreceives = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  PETSCFREE(work);

  /* post receives:
       1) each message will consist of ordered triples
     (global row index,global column index,value); we store the indices
     as doubles to simplify the message passing.
       2) since we don't know how long each individual message is, we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.

       This could be done better.
  */
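
  /*
     Sketch of the resulting buffer layout (matching the packing loop
     below): for the k-th entry destined for a given processor,
         svalues[3*k]   = (Scalar) global row index
         svalues[3*k+1] = (Scalar) global column index
         svalues[3*k+2] = the numerical value
     so each message is a multiple of 3 Scalars long.
  */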
  rvalues = (Scalar *) PETSCMALLOC(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PETSCMALLOC((nreceives+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PETSCMALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
  CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PETSCMALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  for ( i=0; i<aij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
  }
  PETSCFREE(owner);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPIU_SCALAR,i,tag,
                comm,send_waits+count++);
    }
  }
  PETSCFREE(starts); PETSCFREE(nprocs);

  /* Free cache space */
  ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);

  aij->svalues    = svalues;    aij->rvalues = rvalues;
  aij->nsends     = nsends;     aij->nrecvs = nreceives;
  aij->send_waits = send_waits; aij->recv_waits = recv_waits;
  aij->rmax       = nmax;

  return 0;
}
extern int MatSetUpMultiply_MPIAIJ(Mat);

static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  int         ierr;
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n;
  int         row,col,other_disassembled;
  Scalar      *values,val;
  InsertMode  addv = aij->insertmode;

  /*  wait on receives */
  while (count) {
    MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    values = aij->rvalues + 3*imdex*aij->rmax;
    MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      row = (int) PETSCREAL(values[3*i]) - aij->rstart;
      col = (int) PETSCREAL(values[3*i+1]);
      val = values[3*i+2];
      if (col >= aij->cstart && col < aij->cend) {
        col -= aij->cstart;
        MatSetValues(aij->A,1,&row,1,&col,&val,addv);
      }
      else {
        if (aij->assembled) {
          if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
          col = aij->colmap[col] - 1;
          if (col < 0 && !((Mat_AIJ*)(aij->A->data))->nonew) {
            ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
            col = (int) PETSCREAL(values[3*i+1]);
          }
        }
        MatSetValues(aij->B,1,&row,1,&col,&val,addv);
      }
    }
    count--;
  }
  PETSCFREE(aij->recv_waits); PETSCFREE(aij->rvalues);

  /* wait on sends */
  if (aij->nsends) {
    send_status = (MPI_Status *) PETSCMALLOC( aij->nsends*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(aij->nsends,aij->send_waits,send_status);
    PETSCFREE(send_status);
  }
  PETSCFREE(aij->send_waits); PETSCFREE(aij->svalues);

  aij->insertmode = NOTSETVALUES;
  ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled; if so, we must
     also disassemble ourselves, in order that we may reassemble. */
  MPI_Allreduce((void *) &aij->assembled,(void *) &other_disassembled,1,
                 MPI_INT,MPI_PROD,mat->comm);
  if (aij->assembled && !other_disassembled) {
    ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
  }

  if (!aij->assembled && mode == FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
    aij->assembled = 1;
  }
  ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);

  return 0;
}

static int MatZeroEntries_MPIAIJ(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
  int        ierr;
  ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
  return 0;
}

/* Again this uses the same basic strategy as in the assembly and
   scatter create routines; we should try to do it systematically
   if we can figure out the proper level of generality. */

/* The code does not do the diagonal entries correctly unless the
   matrix is square and the column and row ownerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
  int            *procs,*nprocs,j,found,idx,nsends,*work;
  int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  if (!l->assembled)
    SETERRQ(1,"MatZeroRows_MPIAIJ:Must assemble matrix first");
  ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PETSCMALLOC( 2*numtids*sizeof(int) ); CHKPTRQ(nprocs);
  PETSCMEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) PETSCMALLOC((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
  }
  nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length */
  work = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nrecvs = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  PETSCFREE(work);

  /* post receives:   */
  rvalues = (int *) PETSCMALLOC((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PETSCMALLOC((nrecvs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (int *) PETSCMALLOC( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PETSCMALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PETSCMALLOC( (numtids+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  starts[0] = 0;
  for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
                comm,send_waits+count++);
    }
  }
  PETSCFREE(starts);

  base = owners[mytid];

  /*  wait on receives */
  lens = (int *) PETSCMALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex] = recv_status.MPI_SOURCE;
    lens[imdex]   = n;
    slen += n;
    count--;
  }
  PETSCFREE(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) PETSCMALLOC( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  PETSCFREE(rvalues); PETSCFREE(lens);
  PETSCFREE(owner); PETSCFREE(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateSequential(MPI_COMM_SELF,slen,lrows,&istmp); CHKERRQ(ierr);
  PLogObjectParent(A,istmp);
  PETSCFREE(lrows);
  ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PETSCMALLOC( nsends*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PETSCFREE(send_status);
  }
  PETSCFREE(send_waits); PETSCFREE(svalues);

  return 0;
}

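/*
   A minimal caller-side sketch of MatZeroRows (hypothetical usage, assuming
   an assembled parallel matrix A): zero global rows 0 and 5 and put 1.0 on
   the diagonal of the zeroed rows.

       IS     rows;
       int    idx[2] = {0, 5}, ierr;
       Scalar one = 1.0;
       ierr = ISCreateSequential(MPI_COMM_SELF,2,idx,&rows); CHKERRQ(ierr);
       ierr = MatZeroRows(A,rows,&one); CHKERRQ(ierr);
       ierr = ISDestroy(rows); CHKERRQ(ierr);
*/

/*
   The parallel multiply below computes y = A*x_local + B*x_ghost: the
   scatter that gathers the ghost (off-processor) entries of x into lvec
   is overlapped with the multiply by the diagonal block A.
*/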
static int MatMult_MPIAIJ(Mat aijin,Vec xx,Vec yy)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
  int        ierr;
  if (!aij->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix first");
  ierr = VecScatterBegin(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
  CHKERRQ(ierr);
  ierr = MatMult(aij->A,xx,yy); CHKERRQ(ierr);
  ierr = VecScatterEnd(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
  CHKERRQ(ierr);
  ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERRQ(ierr);
  return 0;
}

static int MatMultAdd_MPIAIJ(Mat aijin,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
  int        ierr;
  if (!aij->assembled)
    SETERRQ(1,"MatMultAdd_MPIAIJ:must assemble matrix first");
  ierr = VecScatterBegin(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
  CHKERRQ(ierr);
  ierr = MatMultAdd(aij->A,xx,yy,zz); CHKERRQ(ierr);
  ierr = VecScatterEnd(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
  CHKERRQ(ierr);
  ierr = MatMultAdd(aij->B,aij->lvec,zz,zz); CHKERRQ(ierr);
  return 0;
}

static int MatMultTrans_MPIAIJ(Mat aijin,Vec xx,Vec yy)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
  int        ierr;

  if (!aij->assembled)
    SETERRQ(1,"MatMultTrans_MPIAIJ:must assemble matrix first");
  /* do nondiagonal part */
  ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(aij->lvec,yy,ADDVALUES,
           (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
  /* do local part */
  ierr = MatMultTrans(aij->A,xx,yy); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in yy until the next line, which is true for my implementation */
  /* but may not always be true. */
  ierr = VecScatterEnd(aij->lvec,yy,ADDVALUES,
         (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
  return 0;
}

static int MatMultTransAdd_MPIAIJ(Mat aijin,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
  int        ierr;

  if (!aij->assembled)
    SETERRQ(1,"MatMultTransAdd_MPIAIJ:must assemble matrix first");
  /* do nondiagonal part */
  ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(aij->lvec,zz,ADDVALUES,
         (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
  /* do local part */
  ierr = MatMultTransAdd(aij->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for my implementation */
  /* but may not always be true. */
  ierr = VecScatterEnd(aij->lvec,zz,ADDVALUES,
       (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
  return 0;
}

/*
  This only works correctly for square matrices where the subblock A->A is
  the diagonal block
*/
static int MatGetDiagonal_MPIAIJ(Mat Ain,Vec v)
{
  Mat_MPIAIJ *A = (Mat_MPIAIJ *) Ain->data;
  if (!A->assembled)
    SETERRQ(1,"MatGetDiagonal_MPIAIJ:must assemble matrix first");
  return MatGetDiagonal(A->A,v);
}

static int MatDestroy_MPIAIJ(PetscObject obj)
{
  Mat        mat = (Mat) obj;
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  int        ierr;
#if defined(PETSC_LOG)
  PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
#endif
  PETSCFREE(aij->rowners);
  ierr = MatDestroy(aij->A); CHKERRQ(ierr);
  ierr = MatDestroy(aij->B); CHKERRQ(ierr);
  if (aij->colmap) PETSCFREE(aij->colmap);
  if (aij->garray) PETSCFREE(aij->garray);
  if (aij->lvec) VecDestroy(aij->lvec);
  if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
  PETSCFREE(aij);
  PLogObjectDestroy(mat);
  PETSCHEADERDESTROY(mat);
  return 0;
}
#include "draw.h"
#include "pinclude/pviewer.h"

static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
{
  Mat         mat = (Mat) obj;
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  int         ierr, format;
  PetscObject vobj = (PetscObject) viewer;

  if (!aij->assembled) SETERRQ(1,"MatView_MPIAIJ:must assemble matrix first");
  if (!viewer) { /* so that viewers may be used from debuggers */
    viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
  }
  if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
  format = ViewerFileGetFormat_Private(viewer);
  if (vobj->cookie == VIEWER_COOKIE && format == FILE_FORMAT_INFO &&
     (vobj->type == FILE_VIEWER || vobj->type == FILES_VIEWER)) {
   /* do nothing for now */
  }
  else if (vobj->cookie == VIEWER_COOKIE && vobj->type == FILE_VIEWER) {
    FILE *fd = ViewerFileGetPointer_Private(viewer);
    MPIU_Seq_begin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
           aij->cend);
    ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    MPIU_Seq_end(mat->comm,1);
  }
  else if ((vobj->cookie == VIEWER_COOKIE && vobj->type == FILES_VIEWER) ||
            vobj->cookie == DRAW_COOKIE) {
    int numtids = aij->numtids, mytid = aij->mytid;
    if (numtids == 1) {
      ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    }
    else {
      /* assemble the entire matrix onto first processor. */
      Mat     A;
      Mat_AIJ *Aloc;
      int     M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
      Scalar  *a;

      if (!mytid) {
        ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,0,0,0,&A);
      }
      else {
        ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,0,0,0,&A);
      }
      CHKERRQ(ierr);
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_AIJ*) aij->A->data;
      m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      for ( i=0; i<ai[m]-1; i++ ) {aj[i] += aij->cstart - 1;}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERTVALUES);
        CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
      }
      aj = Aloc->j;
      for ( i=0; i<ai[m]-1; i++ ) {aj[i] -= aij->cstart - 1;}

      /* copy over the B part */
      Aloc = (Mat_AIJ*) aij->B->data;
      m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      ct = cols = (int *) PETSCMALLOC( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
      for ( i=0; i<ai[m]-1; i++ ) {cols[i] = aij->garray[aj[i]-1];}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERTVALUES);
        CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
      }
      PETSCFREE(ct);
      ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      if (!mytid) {
        ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  return 0;
}

extern int MatMarkDiag_AIJ(Mat);
/*
    This has to provide several versions:

     1) as in the sequential case
     2) a) use only local smoothing, updating outer values only once
        b) local smoothing, updating outer values each inner iteration
     3) coloring, updating outer values between colors
*/
static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
                           double shift,int its,Vec xx)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  Mat        AA = mat->A, BB = mat->B;
  Mat_AIJ    *A = (Mat_AIJ *) AA->data, *B = (Mat_AIJ *)BB->data;
  Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
  int        ierr,*idx, *diag;
  int        n = mat->n, m = mat->m, i;
  Vec        tt;

  if (!mat->assembled) SETERRQ(1,"MatRelax_MPIAIJ:must assemble matrix first");

  VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
  xs = x - 1; /* shift by one for index start of 1 */
  ls--;
  if (!A->diag) {if ((ierr = MatMarkDiag_AIJ(AA))) return ierr;}
  diag = A->diag;
  if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
    SETERRQ(1,"MatRelax_MPIAIJ:Option not yet supported");
  }
  if (flag & SOR_EISENSTAT) {
    /* Let A = L + D + U, where L is strictly lower triangular,
    U is strictly upper triangular, and D is diagonal. This routine applies

            (L + E)^{-1} A (U + E)^{-1}

    to a vector efficiently using Eisenstat's trick.  This is for
    the case of the SSOR preconditioner, so E is D/omega, where omega
    is the relaxation factor.
    */
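
    /*
       A sketch of the algebra behind the trick (ignoring the shift):
       writing A = (L + E) + (U + E) + (D - 2E) gives

        (L+E)^{-1} A (U+E)^{-1}
            = (U+E)^{-1} + (L+E)^{-1} + (L+E)^{-1} (D - 2E) (U+E)^{-1}

       so with  x = (U+E)^{-1} b  and  t = (L+E)^{-1} (b - (2E - D) x),
       the result is x + t, which is exactly the three sweeps performed
       below.  Note scale = 2/omega - 1, so (2E - D) = scale*D.
    */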
    ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
    PLogObjectParent(matin,tt);
    VecGetArray(tt,&t);
    scale = (2.0/omega) - 1.0;
    /*  x = (E + U)^{-1} b */
    VecSet(&zero,mat->lvec);
    ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                              mat->Mvctx); CHKERRQ(ierr);
    for ( i=m-1; i>-1; i-- ) {
      n    = A->i[i+1] - diag[i] - 1;
      idx  = A->j + diag[i];
      v    = A->a + diag[i];
      sum  = b[i];
      SPARSEDENSEMDOT(sum,xs,v,idx,n);
      d    = shift + A->a[diag[i]-1];
      n    = B->i[i+1] - B->i[i];
      idx  = B->j + B->i[i] - 1;
      v    = B->a + B->i[i] - 1;
      SPARSEDENSEMDOT(sum,ls,v,idx,n);
      x[i] = omega*(sum/d);
    }
    ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                            mat->Mvctx); CHKERRQ(ierr);

    /*  t = b - (2*E - D)x */
    v = A->a;
    for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ - 1])*x[i]; }

    /*  t = (E + L)^{-1}t */
    ts = t - 1; /* shifted by one for index start of a or mat->j */
    diag = A->diag;
    VecSet(&zero,mat->lvec);
    ierr = VecPipelineBegin(tt,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                                                 mat->Mvctx); CHKERRQ(ierr);
    for ( i=0; i<m; i++ ) {
      n    = diag[i] - A->i[i];
      idx  = A->j + A->i[i] - 1;
      v    = A->a + A->i[i] - 1;
      sum  = t[i];
      SPARSEDENSEMDOT(sum,ts,v,idx,n);
      d    = shift + A->a[diag[i]-1];
      n    = B->i[i+1] - B->i[i];
      idx  = B->j + B->i[i] - 1;
      v    = B->a + B->i[i] - 1;
      SPARSEDENSEMDOT(sum,ls,v,idx,n);
      t[i] = omega*(sum/d);
    }
    ierr = VecPipelineEnd(tt,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                                                    mat->Mvctx); CHKERRQ(ierr);
    /*  x = x + t */
    for ( i=0; i<m; i++ ) { x[i] += t[i]; }
    VecDestroy(tt);
    return 0;
  }

  if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      VecSet(&zero,mat->lvec); VecSet(&zero,xx);
    }
    else {
      ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
      CHKERRQ(ierr);
      ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
      CHKERRQ(ierr);
    }
    while (its--) {
      /* go down through the rows */
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                            mat->Mvctx); CHKERRQ(ierr);
      /* come up through the rows */
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                            mat->Mvctx); CHKERRQ(ierr);
    }
  }
  else if (flag & SOR_FORWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      VecSet(&zero,mat->lvec);
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=0; i<m; i++ ) {
        n    = diag[i] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = omega*(sum/d);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                            mat->Mvctx); CHKERRQ(ierr);
      its--;
    }
    while (its--) {
      ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
      CHKERRQ(ierr);
      ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
      CHKERRQ(ierr);
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
                            mat->Mvctx); CHKERRQ(ierr);
    }
  }
  else if (flag & SOR_BACKWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      VecSet(&zero,mat->lvec);
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - diag[i] - 1;
        idx  = A->j + diag[i];
        v    = A->a + diag[i];
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = omega*(sum/d);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                            mat->Mvctx); CHKERRQ(ierr);
      its--;
    }
    while (its--) {
      ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERDOWN,
                            mat->Mvctx); CHKERRQ(ierr);
      ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERDOWN,
                            mat->Mvctx); CHKERRQ(ierr);
      ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                              mat->Mvctx); CHKERRQ(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
                            mat->Mvctx); CHKERRQ(ierr);
    }
  }
  else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
    CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
    CHKERRQ(ierr);
    while (its--) {
      /* go down through the rows */
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      /* come up through the rows */
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_FORWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
    CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
    CHKERRQ(ierr);
    while (its--) {
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,
                            mat->Mvctx); CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,
                            mat->Mvctx); CHKERRQ(ierr);
    while (its--) {
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  return 0;
}

static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
                             int *nzalloc,int *mem)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  Mat        A = mat->A, B = mat->B;
  int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;

  ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
  ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
  isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
  if (flag == MAT_LOCAL) {
    *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
  } else if (flag == MAT_GLOBAL_MAX) {
    MPI_Allreduce((void *) isend,(void *) irecv,3,MPI_INT,MPI_MAX,matin->comm);
    *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
  } else if (flag == MAT_GLOBAL_SUM) {
    MPI_Allreduce((void *) isend,(void *) irecv,3,MPI_INT,MPI_SUM,matin->comm);
    *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
  }
  return 0;
}

extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);

static int MatSetOption_MPIAIJ(Mat aijin,MatOption op)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;

  if (op == NO_NEW_NONZERO_LOCATIONS || op == YES_NEW_NONZERO_LOCATIONS) {
    MatSetOption(aij->A,op);
    MatSetOption(aij->B,op);
  }
  else if (op == COLUMN_ORIENTED)
    SETERRQ(1,"MatSetOption_MPIAIJ:Column oriented not supported");
  return 0;
}

static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  *m = mat->M; *n = mat->N;
  return 0;
}

static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  *m = mat->m; *n = mat->n;
  return 0;
}

static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  *m = mat->rstart; *n = mat->rend;
  return 0;
}

static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB;
  int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
  int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;

  if (row < rstart || row >= rend)
    SETERRQ(1,"MatGetRow_MPIAIJ:Currently you can get only local rows");
  lrow = row - rstart;

  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  if (v || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
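      /* e.g. with cstart = 4, B columns {1,2,7}, and A columns {4,5}
         (global numbering), the merged row is {1,2,4,5,7}: B entries
         below cstart come first, then all of A, then the rest of B */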
      int imark;
      if (mat->assembled) {
        for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
      }
      if (v) {
        *v = (Scalar *) PETSCMALLOC( (nztot)*sizeof(Scalar) ); CHKPTRQ(*v);
        for ( i=0; i<nzB; i++ ) {
          if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
      }
      if (idx) {
        *idx = (int *) PETSCMALLOC( (nztot)*sizeof(int) ); CHKPTRQ(*idx);
        for (i=0; i<nzA; i++) cworkA[i] += cstart;
        for ( i=0; i<nzB; i++ ) {
          if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
        for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
      }
    }
    else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  return 0;
}

static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
{
  if (idx) PETSCFREE(*idx);
  if (v) PETSCFREE(*v);
  return 0;
}
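
/*
   A hypothetical usage sketch: MatGetRow must be paired with MatRestoreRow,
   which frees the arrays allocated above.

       int    nz, *cols;
       Scalar *vals;
       ierr = MatGetRow(mat,row,&nz,&cols,&vals); CHKERRQ(ierr);
       ...use nz, cols[], vals[]...
       ierr = MatRestoreRow(mat,row,&nz,&cols,&vals); CHKERRQ(ierr);
*/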

static int MatNorm_MPIAIJ(Mat mat,MatNormType type,double *norm)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  int        ierr;
  if (aij->numtids == 1) {
    ierr = MatNorm(aij->A,type,norm); CHKERRQ(ierr);
  } else {
    SETERRQ(1,"MatNorm_MPIAIJ: no parallel norms yet.");
  }
  return 0;
}

static int MatTranspose_MPIAIJ(Mat A,Mat *Bin)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
  int        ierr;
  Mat        B;
  Mat_AIJ    *Aloc;
  int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
  Scalar     *array;

  ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,0,0,0,&B);
  CHKERRQ(ierr);

  /* copy over the A part */
  Aloc = (Mat_AIJ*) a->A->data;
  m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row = a->rstart;
  for ( i=0; i<ai[m]-1; i++ ) {aj[i] += a->cstart - 1;}
  for ( i=0; i<m; i++ ) {
    ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERTVALUES);
    CHKERRQ(ierr);
    row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
  }
  aj = Aloc->j;
  for ( i=0; i<ai[m]-1; i++ ) {aj[i] -= a->cstart - 1;}

  /* copy over the B part */
  Aloc = (Mat_AIJ*) a->B->data;
  m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row = a->rstart;
  ct = cols = (int *) PETSCMALLOC( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
  for ( i=0; i<ai[m]-1; i++ ) {cols[i] = a->garray[aj[i]-1];}
  for ( i=0; i<m; i++ ) {
    ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERTVALUES);
    CHKERRQ(ierr);
    row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
  }
  PETSCFREE(ct);
  ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
  *Bin = B;
  return 0;
}

extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
static int MatCopyPrivate_MPIAIJ(Mat,Mat *);

/* -------------------------------------------------------------------*/
static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
       MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
       MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
       MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
       MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
       MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
       MatLUFactor_MPIAIJ,0,
       MatRelax_MPIAIJ,
       MatTranspose_MPIAIJ,
       MatGetInfo_MPIAIJ,0,
       MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
       MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
       0,
       MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
       MatGetReordering_MPIAIJ,
       MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
       MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
       MatILUFactorSymbolic_MPIAIJ,0,
       0,0,MatConvert_MPIAIJ,0,0,MatCopyPrivate_MPIAIJ};

/*@
   MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
   (the default parallel PETSc format).

   Input Parameters:
.  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated
           if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated
           if N is given)
.  M - number of global rows (or PETSC_DECIDE to have it calculated
           if m is given)
.  N - number of global columns (or PETSC_DECIDE to have it calculated
           if n is given)
.  d_nz - number of nonzeros per row in diagonal portion of local submatrix
           (same for all local rows)
.  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
           or null (possibly different for each row).  You must leave room
           for the diagonal entry even if it is zero.
.  o_nz - number of nonzeros per row in off-diagonal portion of local
           submatrix (same for all local rows).
.  o_nnz - number of nonzeros per row in off-diagonal portion of local
           submatrix or null (possibly different for each row).

   Output Parameter:
.  newmat - the matrix

   Notes:
   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices begin at
   one, not zero.  See the users manual for further details.

   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both). Set both
   d_nz and d_nnz to zero for PETSc to control dynamic memory allocation.
   Likewise, specify preallocated storage for the off-diagonal part of
   the local submatrix with o_nz or o_nnz (not both).

.keywords: matrix, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSequentialAIJ(), MatSetValues()
@*/
int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
                 int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
{
  Mat          mat;
  Mat_MPIAIJ   *aij;
  int          ierr, i,sum[2],work[2];
  *newmat         = 0;
  PETSCHEADERCREATE(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
  PLogObjectCreate(mat);
  mat->data       = (void *) (aij = PETSCNEW(Mat_MPIAIJ)); CHKPTRQ(aij);
  mat->ops        = &MatOps;
  mat->destroy    = MatDestroy_MPIAIJ;
  mat->view       = MatView_MPIAIJ;
  mat->factor     = 0;

  aij->insertmode = NOTSETVALUES;
  MPI_Comm_rank(comm,&aij->mytid);
  MPI_Comm_size(comm,&aij->numtids);

  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    MPI_Allreduce((void *) work,(void *) sum,2,MPI_INT,MPI_SUM,comm);
    if (M == PETSC_DECIDE) M = sum[0];
    if (N == PETSC_DECIDE) N = sum[1];
  }
  if (m == PETSC_DECIDE)
    {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
  if (n == PETSC_DECIDE)
    {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
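  /* e.g. M = 10 on 3 processors gives local sizes 4, 3, 3:
     each of the first M % numtids ranks gets one extra row */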
  aij->m = m;
  aij->n = n;
  aij->N = N;
  aij->M = M;

  /* build local table of row and column ownerships */
  aij->rowners = (int *) PETSCMALLOC(2*(aij->numtids+2)*sizeof(int));
  CHKPTRQ(aij->rowners);
  PLogObjectMemory(mat,2*(aij->numtids+2)*sizeof(int)+sizeof(struct _Mat)+
                       sizeof(Mat_MPIAIJ));
  aij->cowners = aij->rowners + aij->numtids + 1;
  MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
  aij->rowners[0] = 0;
  for ( i=2; i<=aij->numtids; i++ ) {
    aij->rowners[i] += aij->rowners[i-1];
  }
  aij->rstart = aij->rowners[aij->mytid];
  aij->rend   = aij->rowners[aij->mytid+1];
  MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
  aij->cowners[0] = 0;
  for ( i=2; i<=aij->numtids; i++ ) {
    aij->cowners[i] += aij->cowners[i-1];
  }
  aij->cstart = aij->cowners[aij->mytid];
  aij->cend   = aij->cowners[aij->mytid+1];

  ierr = MatCreateSequentialAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&aij->A);
  CHKERRQ(ierr);
  PLogObjectParent(mat,aij->A);
  ierr = MatCreateSequentialAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&aij->B);
  CHKERRQ(ierr);
  PLogObjectParent(mat,aij->B);

  /* build cache for off-processor entries formed during MatSetValues */
  ierr = StashBuild_Private(&aij->stash); CHKERRQ(ierr);
  aij->colmap    = 0;
  aij->garray    = 0;

  /* stuff used for matrix vector multiply */
  aij->lvec      = 0;
  aij->Mvctx     = 0;
  aij->assembled = 0;

  *newmat = mat;
  return 0;
}
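
/*
   A minimal usage sketch (hypothetical example): create a parallel AIJ
   matrix, set an entry in a locally owned row, and assemble it before use.

       Mat    A;
       int    ierr, i, j;
       Scalar v = 1.0;
       ierr = MatCreateMPIAIJ(MPI_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,
                              100,100,5,0,2,0,&A); CHKERRQ(ierr);
       i = j = 0;
       ierr = MatSetValues(A,1,&i,1,&j,&v,INSERTVALUES); CHKERRQ(ierr);
       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
*/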

static int MatCopyPrivate_MPIAIJ(Mat matin,Mat *newmat)
{
  Mat        mat;
  Mat_MPIAIJ *aij,*oldmat = (Mat_MPIAIJ *) matin->data;
  int        ierr, len;
  *newmat      = 0;

  if (!oldmat->assembled)
    SETERRQ(1,"MatCopyPrivate_MPIAIJ:Cannot copy unassembled matrix");
  PETSCHEADERCREATE(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
  PLogObjectCreate(mat);
  mat->data       = (void *) (aij = PETSCNEW(Mat_MPIAIJ)); CHKPTRQ(aij);
  mat->ops        = &MatOps;
  mat->destroy    = MatDestroy_MPIAIJ;
  mat->view       = MatView_MPIAIJ;
  mat->factor     = matin->factor;

  aij->m          = oldmat->m;
  aij->n          = oldmat->n;
  aij->M          = oldmat->M;
  aij->N          = oldmat->N;

  aij->assembled  = 1;
  aij->rstart     = oldmat->rstart;
  aij->rend       = oldmat->rend;
  aij->cstart     = oldmat->cstart;
  aij->cend       = oldmat->cend;
  aij->numtids    = oldmat->numtids;
  aij->mytid      = oldmat->mytid;
  aij->insertmode = NOTSETVALUES;

  aij->rowners = (int *) PETSCMALLOC( (aij->numtids+1)*sizeof(int) );
  CHKPTRQ(aij->rowners);
  PLogObjectMemory(mat,(aij->numtids+1)*sizeof(int)+sizeof(struct _Mat)+
                       sizeof(Mat_MPIAIJ));
  PETSCMEMCPY(aij->rowners,oldmat->rowners,(aij->numtids+1)*sizeof(int));
  ierr = StashInitialize_Private(&aij->stash); CHKERRQ(ierr);
  if (oldmat->colmap) {
    aij->colmap = (int *) PETSCMALLOC( (aij->N)*sizeof(int) );
    CHKPTRQ(aij->colmap);
    PLogObjectMemory(mat,(aij->N)*sizeof(int));
    PETSCMEMCPY(aij->colmap,oldmat->colmap,(aij->N)*sizeof(int));
  } else aij->colmap = 0;
  if (oldmat->garray && (len = ((Mat_AIJ *) (oldmat->B->data))->n)) {
    aij->garray = (int *) PETSCMALLOC(len*sizeof(int)); CHKPTRQ(aij->garray);
    PLogObjectMemory(mat,len*sizeof(int));
    PETSCMEMCPY(aij->garray,oldmat->garray,len*sizeof(int));
  } else aij->garray = 0;

  ierr = VecDuplicate(oldmat->lvec,&aij->lvec); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->lvec);
  ierr = VecScatterCtxCopy(oldmat->Mvctx,&aij->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->Mvctx);
  ierr = MatConvert(oldmat->A,MATSAME,&aij->A); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->A);
  ierr = MatConvert(oldmat->B,MATSAME,&aij->B); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->B);
  *newmat = mat;
  return 0;
}
1327