xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision ed2daf618d39a4e4da77b7c71505379737a539fc)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.100 1995/11/25 23:49:23 curfman Exp curfman $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the number of global columns in the matrix.
13 */
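/*
   A hypothetical illustration (assuming indexshift is 0): if the off-diagonal
   part of the local matrix references global columns garray = {3,7,12}, then
   after this routine colmap[3] = 0, colmap[7] = 1, colmap[12] = 2, and all
   other entries of colmap remain 0.
*/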
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,
40                                int *idxn,Scalar *v,InsertMode addv)
41 {
42   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
43   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (idxm[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (idxm[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (idxm[i] >= rstart && idxm[i] < rend) {
56       row = idxm[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (idxn[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (idxn[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (idxn[j] >= cstart && idxn[j] < cend){
61           col = idxn[j] - cstart;
62           ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
63         }
64         else {
65           if (aij->assembled) {
66             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
67             col = aij->colmap[idxn[j]] + shift;
68             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
69               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
70               col =  idxn[j];
71             }
72           }
73           else col = idxn[j];
74           ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
75         }
76       }
77     }
78     else {
79       ierr = StashValues_Private(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERRQ(ierr);
80     }
81   }
82   return 0;
83 }
84 
85 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
86 {
87   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
88   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
89   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
90   int        cstart = aij->cstart, cend = aij->cend,row,col;
91   int        shift = C->indexshift;
92 
93   if (!aij->assembled) SETERRQ(1,"MatGetValues_MPIAIJ:Not for unassembled matrix");
94   for ( i=0; i<m; i++ ) {
95     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
96     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
97     if (idxm[i] >= rstart && idxm[i] < rend) {
98       row = idxm[i] - rstart;
99       for ( j=0; j<n; j++ ) {
100         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
101         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
102         if (idxn[j] >= cstart && idxn[j] < cend){
103           col = idxn[j] - cstart;
104           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
105         }
106         else {
107           col = aij->colmap[idxn[j]] + shift;
108           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
109         }
110       }
111     }
112     else {
113       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
114     }
115   }
116   return 0;
117 }
118 
119 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
120 {
121   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
122   MPI_Comm    comm = mat->comm;
123   int         size = aij->size, *owners = aij->rowners;
124   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
125   MPI_Request *send_waits,*recv_waits;
126   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
127   InsertMode  addv;
128   Scalar      *rvalues,*svalues;
129 
130   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
131   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
132   if (addv == (ADD_VALUES|INSERT_VALUES)) {
133     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
134   }
135   aij->insertmode = addv; /* in case this processor had no cache */
136 
137   /*  first count number of contributors to each processor */
138   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
139   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
140   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
141   for ( i=0; i<aij->stash.n; i++ ) {
142     idx = aij->stash.idx[i];
143     for ( j=0; j<size; j++ ) {
144       if (idx >= owners[j] && idx < owners[j+1]) {
145         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
146       }
147     }
148   }
149   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
150 
151   /* inform other processors of number of messages and max length*/
152   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
153   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
154   nreceives = work[rank];
155   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
156   nmax = work[rank];
157   PetscFree(work);
158 
159   /* post receives:
160        1) each message will consist of ordered triples
161      (global row index,global column index,value); we store the indices
162      as Scalars (doubles) to simplify the message passing.
163        2) since we don't know how long each individual message is we
164      allocate the largest needed buffer for each receive. Potentially
165      this is a lot of wasted space.
166 
167 
168        This could be done better.
169   */
170   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
171   CHKPTRQ(rvalues);
172   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
173   CHKPTRQ(recv_waits);
174   for ( i=0; i<nreceives; i++ ) {
175     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
176               comm,recv_waits+i);
177   }
178 
179   /* do sends:
180       1) starts[i] gives the starting index in svalues for stuff going to
181          the ith processor
182   */
183   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
184   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
185   CHKPTRQ(send_waits);
186   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
187   starts[0] = 0;
188   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
189   for ( i=0; i<aij->stash.n; i++ ) {
190     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
191     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
192     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
193   }
194   PetscFree(owner);
195   starts[0] = 0;
196   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
197   count = 0;
198   for ( i=0; i<size; i++ ) {
199     if (procs[i]) {
200       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
201                 comm,send_waits+count++);
202     }
203   }
204   PetscFree(starts); PetscFree(nprocs);
205 
206   /* Free cache space */
207   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
208 
209   aij->svalues    = svalues;    aij->rvalues    = rvalues;
210   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
211   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
212   aij->rmax       = nmax;
213 
214   return 0;
215 }
216 extern int MatSetUpMultiply_MPIAIJ(Mat);
217 
218 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
219 {
220   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
221   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
222   MPI_Status  *send_status,recv_status;
223   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
224   int         row,col,other_disassembled,shift = C->indexshift;
225   Scalar      *values,val;
226   InsertMode  addv = aij->insertmode;
227 
228   /*  wait on receives */
229   while (count) {
230     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
231     /* unpack receives into our local space */
232     values = aij->rvalues + 3*imdex*aij->rmax;
233     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
234     n = n/3;
235     for ( i=0; i<n; i++ ) {
236       row = (int) PETSCREAL(values[3*i]) - aij->rstart;
237       col = (int) PETSCREAL(values[3*i+1]);
238       val = values[3*i+2];
239       if (col >= aij->cstart && col < aij->cend) {
240         col -= aij->cstart;
241         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
242       }
243       else {
244         if (aij->assembled) {
245           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
246           col = aij->colmap[col] + shift;
247           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
248             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
249             col = (int) PETSCREAL(values[3*i+1]);
250           }
251         }
252         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
253       }
254     }
255     count--;
256   }
257   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
258 
259   /* wait on sends */
260   if (aij->nsends) {
261     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
262     CHKPTRQ(send_status);
263     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
264     PetscFree(send_status);
265   }
266   PetscFree(aij->send_waits); PetscFree(aij->svalues);
267 
268   aij->insertmode = NOT_SET_VALUES;
269   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
270   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
271 
272   /* determine if any processor has disassembled; if so, we must
273      also disassemble ourselves, in order that we may reassemble. */
274   MPI_Allreduce(&aij->assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
275   if (aij->assembled && !other_disassembled) {
276     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
277   }
278 
279   if (!aij->assembled && mode == FINAL_ASSEMBLY) {
280     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
281     aij->assembled = 1;
282   }
283   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
285 
286   return 0;
287 }
288 
289 static int MatZeroEntries_MPIAIJ(Mat A)
290 {
291   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
292   int        ierr;
293   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
294   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
295   return 0;
296 }
297 
298 /* the code does not do the diagonal entries correctly unless the
299    matrix is square and the column and row ownerships are identical.
300    This is a BUG. The only way to fix it seems to be to access
301    aij->A and aij->B directly and not through the MatZeroRows()
302    routine.
303 */
304 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
305 {
306   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
307   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
308   int            *procs,*nprocs,j,found,idx,nsends,*work;
309   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
310   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
311   int            *lens,imdex,*lrows,*values;
312   MPI_Comm       comm = A->comm;
313   MPI_Request    *send_waits,*recv_waits;
314   MPI_Status     recv_status,*send_status;
315   IS             istmp;
316 
317   if (!l->assembled) SETERRQ(1,"MatZeroRows_MPIAIJ:Must assemble matrix");
318   ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
319   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
320 
321   /*  first count number of contributors to each processor */
322   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
323   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
324   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
325   for ( i=0; i<N; i++ ) {
326     idx = rows[i];
327     found = 0;
328     for ( j=0; j<size; j++ ) {
329       if (idx >= owners[j] && idx < owners[j+1]) {
330         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
331       }
332     }
333     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
334   }
335   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
336 
337   /* inform other processors of number of messages and max length*/
338   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
339   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
340   nrecvs = work[rank];
341   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
342   nmax = work[rank];
343   PetscFree(work);
344 
345   /* post receives:   */
346   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
347   CHKPTRQ(rvalues);
348   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
349   CHKPTRQ(recv_waits);
350   for ( i=0; i<nrecvs; i++ ) {
351     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
352   }
353 
354   /* do sends:
355       1) starts[i] gives the starting index in svalues for stuff going to
356          the ith processor
357   */
358   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
359   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
360   CHKPTRQ(send_waits);
361   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
362   starts[0] = 0;
363   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
364   for ( i=0; i<N; i++ ) {
365     svalues[starts[owner[i]]++] = rows[i];
366   }
367   ISRestoreIndices(is,&rows);
368 
369   starts[0] = 0;
370   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
371   count = 0;
372   for ( i=0; i<size; i++ ) {
373     if (procs[i]) {
374       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
375     }
376   }
377   PetscFree(starts);
378 
379   base = owners[rank];
380 
381   /*  wait on receives */
382   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
383   source = lens + nrecvs;
384   count  = nrecvs; slen = 0;
385   while (count) {
386     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
387     /* unpack receives into our local space */
388     MPI_Get_count(&recv_status,MPI_INT,&n);
389     source[imdex]  = recv_status.MPI_SOURCE;
390     lens[imdex]  = n;
391     slen += n;
392     count--;
393   }
394   PetscFree(recv_waits);
395 
396   /* gather the received row indices into a single local list */
397   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
398   count = 0;
399   for ( i=0; i<nrecvs; i++ ) {
400     values = rvalues + i*nmax;
401     for ( j=0; j<lens[i]; j++ ) {
402       lrows[count++] = values[j] - base;
403     }
404   }
405   PetscFree(rvalues); PetscFree(lens);
406   PetscFree(owner); PetscFree(nprocs);
407 
408   /* actually zap the local rows */
409   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
410   PLogObjectParent(A,istmp);
411   PetscFree(lrows);
412   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
413   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
414   ierr = ISDestroy(istmp); CHKERRQ(ierr);
415 
416   /* wait on sends */
417   if (nsends) {
418     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
419     CHKPTRQ(send_status);
420     MPI_Waitall(nsends,send_waits,send_status);
421     PetscFree(send_status);
422   }
423   PetscFree(send_waits); PetscFree(svalues);
424 
425   return 0;
426 }
427 
428 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
429 {
430   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
431   int        ierr;
432 
433   if (!a->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix");
434   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
435   ierr = MatMult(a->A,xx,yy); CHKERRQ(ierr);
436   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
437   ierr = MatMultAdd(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
438   return 0;
439 }
440 
441 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
442 {
443   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
444   int        ierr;
445   if (!a->assembled) SETERRQ(1,"MatMultAdd_MPIAIJ:must assemble matrix");
446   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
447   ierr = MatMultAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
448   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
449   ierr = MatMultAdd(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
450   return 0;
451 }
452 
453 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
454 {
455   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
456   int        ierr;
457 
458   if (!a->assembled) SETERRQ(1,"MatMultTrans_MPIAIJ:must assemble matrix");
459   /* do nondiagonal part */
460   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
461   /* send it on its way */
462   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
463                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
464   /* do local part */
465   ierr = MatMultTrans(a->A,xx,yy); CHKERRQ(ierr);
466   /* receive remote parts: note this assumes the values are not actually */
467   /* inserted in yy until the next line, which is true for my implementation*/
468   /* but may not always be true. */
469   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
470                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
471   return 0;
472 }
473 
474 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
475 {
476   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
477   int        ierr;
478 
479   if (!a->assembled) SETERRQ(1,"MatMultTransAdd_MPIAIJ:must assemble matrix");
480   /* do nondiagonal part */
481   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
482   /* send it on its way */
483   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
484                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
485   /* do local part */
486   ierr = MatMultTransAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
487   /* receive remote parts: note this assumes the values are not actually */
488   /* inserted in zz until the next line, which is true for my implementation */
489   /* but may not always be true. */
490   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
491                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
492   return 0;
493 }
494 
495 /*
496   This only works correctly for square matrices where the subblock A->A is the
497    diagonal block
498 */
499 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
500 {
501   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
502   if (!a->assembled) SETERRQ(1,"MatGetDiagonal_MPIAIJ:must assemble matrix");
503   return MatGetDiagonal(a->A,v);
504 }
505 
506 static int MatDestroy_MPIAIJ(PetscObject obj)
507 {
508   Mat        mat = (Mat) obj;
509   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
510   int        ierr;
511 #if defined(PETSC_LOG)
512   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
513 #endif
514   PetscFree(aij->rowners);
515   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
516   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
517   if (aij->colmap) PetscFree(aij->colmap);
518   if (aij->garray) PetscFree(aij->garray);
519   if (aij->lvec)   VecDestroy(aij->lvec);
520   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
521   PetscFree(aij);
522   PLogObjectDestroy(mat);
523   PetscHeaderDestroy(mat);
524   return 0;
525 }
526 #include "draw.h"
527 #include "pinclude/pviewer.h"
528 
529 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
530 {
531   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
532   int         ierr;
533 
534   if (aij->size == 1) {
535     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
536   }
537   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
538   return 0;
539 }
540 
541 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
542 {
543   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
544   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
545   int         ierr, format,shift = C->indexshift,rank;
546   PetscObject vobj = (PetscObject) viewer;
547   FILE        *fd;
548 
549   if (vobj->type == ASCII_FILE_VIEWER || vobj->type == ASCII_FILES_VIEWER) {
550     ierr = ViewerFileGetFormat_Private(viewer,&format);
551     if (format == FILE_FORMAT_INFO_DETAILED) {
552       int nz,nzalloc,mem;
553       MPI_Comm_rank(mat->comm,&rank);
554       ierr = ViewerFileGetPointer_Private(viewer,&fd); CHKERRQ(ierr);
555       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
556       MPIU_Seq_begin(mat->comm,1);
557       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d \n",rank,aij->m,nz,
558                 nzalloc,mem);
559       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
560       fprintf(fd,"[%d] on diagonal nz %d \n",rank,nz);
561       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
562       fprintf(fd,"[%d] off diagonal nz %d \n",rank,nz);
563       fflush(fd);
564       MPIU_Seq_end(mat->comm,1);
565       ierr = VecScatterView(aij->Mvctx,viewer);
566       return 0;
567     }
568     else if (format == FILE_FORMAT_INFO) {
569       return 0;
570     }
571   }
572 
573   if (vobj->type == ASCII_FILE_VIEWER) {
574     ierr = ViewerFileGetPointer_Private(viewer,&fd); CHKERRQ(ierr);
575     MPIU_Seq_begin(mat->comm,1);
576     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
577            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
578            aij->cend);
579     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
580     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
581     fflush(fd);
582     MPIU_Seq_end(mat->comm,1);
583   }
584   else {
585     int size = aij->size;
586     rank = aij->rank;
587     if (size == 1) {
588       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
589     }
590     else {
591       /* assemble the entire matrix onto first processor. */
592       Mat         A;
593       Mat_SeqAIJ *Aloc;
594       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
595       Scalar      *a;
596 
597       if (!rank) {
598         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PetscNull,0,PetscNull,&A);
599                CHKERRQ(ierr);
600       }
601       else {
602         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PetscNull,0,PetscNull,&A);
603                CHKERRQ(ierr);
604       }
605       PLogObjectParent(mat,A);
606 
607       /* copy over the A part */
608       Aloc = (Mat_SeqAIJ*) aij->A->data;
609       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
610       row = aij->rstart;
611       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
612       for ( i=0; i<m; i++ ) {
613         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
614         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
615       }
616       aj = Aloc->j;
617       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
618 
619       /* copy over the B part */
620       Aloc = (Mat_SeqAIJ*) aij->B->data;
621       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
622       row = aij->rstart;
623       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
624       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
625       for ( i=0; i<m; i++ ) {
626         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
627         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
628       }
629       PetscFree(ct);
630       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
631       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
632       if (!rank) {
633         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
634       }
635       ierr = MatDestroy(A); CHKERRQ(ierr);
636     }
637   }
638   return 0;
639 }
640 
641 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
642 {
643   Mat         mat = (Mat) obj;
644   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
645   int         ierr;
646   PetscObject vobj = (PetscObject) viewer;
647 
648   if (!aij->assembled) SETERRQ(1,"MatView_MPIAIJ:must assemble matrix");
649   if (!viewer) {
650     viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
651   }
652   if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
653   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILE_VIEWER) {
654     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
655   }
656   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILES_VIEWER) {
657     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
658   }
659   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == MATLAB_VIEWER) {
660     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
661   }
662   else if (vobj->cookie == DRAW_COOKIE) {
663     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
664   }
665   else if (vobj->type == BINARY_FILE_VIEWER) {
666     return MatView_MPIAIJ_Binary(mat,viewer);
667   }
668   return 0;
669 }
670 
671 extern int MatMarkDiag_SeqAIJ(Mat);
672 /*
673     This has to provide several versions.
674 
675      1) same as the sequential version
676      2) a) use only local smoothing, updating outer values only once.
677         b) local smoothing, updating outer values each inner iteration
678      3) coloring, updating outer values between colors.
679 */
680 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
681                            double fshift,int its,Vec xx)
682 {
683   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
684   Mat        AA = mat->A, BB = mat->B;
685   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
686   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
687   int        ierr,*idx, *diag;
688   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
689   Vec        tt;
690 
691   if (!mat->assembled) SETERRQ(1,"MatRelax_MPIAIJ:must assemble matrix");
692 
693   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
694   xs = x + shift; /* shift by one for index start of 1 */
695   ls = ls + shift;
696   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
697   diag = A->diag;
698   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
699     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
700   }
701   if (flag & SOR_EISENSTAT) {
702     /* Let  A = L + U + D, where L is lower triangular,
703     U is upper triangular, and D is diagonal; this routine applies
704 
705             (L + E)^{-1} A (U + E)^{-1}
706 
707     to a vector efficiently using Eisenstat's trick. This is for
708     the case of the SSOR preconditioner, so E is D/omega, where omega
709     is the relaxation factor.
710     */
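    /* A short note on why the four steps below work: with A = L + D + U and
       E = D/omega, one has the identity

             (L + E)^{-1} A (U + E)^{-1}
           = (U + E)^{-1} + (L + E)^{-1} (I - (2E - D)(U + E)^{-1}),

       which is exactly what is computed as
             x = (E + U)^{-1} b,   t = b - (2E - D) x,
             t = (E + L)^{-1} t,   x = x + t.                                 */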
711     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
712     PLogObjectParent(matin,tt);
713     VecGetArray(tt,&t);
714     scale = (2.0/omega) - 1.0;
715     /*  x = (E + U)^{-1} b */
716     VecSet(&zero,mat->lvec);
717     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
718                               mat->Mvctx); CHKERRQ(ierr);
719     for ( i=m-1; i>-1; i-- ) {
720       n    = A->i[i+1] - diag[i] - 1;
721       idx  = A->j + diag[i] + !shift;
722       v    = A->a + diag[i] + !shift;
723       sum  = b[i];
724       SPARSEDENSEMDOT(sum,xs,v,idx,n);
725       d    = fshift + A->a[diag[i]+shift];
726       n    = B->i[i+1] - B->i[i];
727       idx  = B->j + B->i[i] + shift;
728       v    = B->a + B->i[i] + shift;
729       SPARSEDENSEMDOT(sum,ls,v,idx,n);
730       x[i] = omega*(sum/d);
731     }
732     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
733                             mat->Mvctx); CHKERRQ(ierr);
734 
735     /*  t = b - (2*E - D)x */
736     v = A->a;
737     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
738 
739     /*  t = (E + L)^{-1}t */
740     ts = t + shift; /* shift by one for index start of 1 */
741     diag = A->diag;
742     VecSet(&zero,mat->lvec);
743     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
744                                                  mat->Mvctx); CHKERRQ(ierr);
745     for ( i=0; i<m; i++ ) {
746       n    = diag[i] - A->i[i];
747       idx  = A->j + A->i[i] + shift;
748       v    = A->a + A->i[i] + shift;
749       sum  = t[i];
750       SPARSEDENSEMDOT(sum,ts,v,idx,n);
751       d    = fshift + A->a[diag[i]+shift];
752       n    = B->i[i+1] - B->i[i];
753       idx  = B->j + B->i[i] + shift;
754       v    = B->a + B->i[i] + shift;
755       SPARSEDENSEMDOT(sum,ls,v,idx,n);
756       t[i] = omega*(sum/d);
757     }
758     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
759                                                     mat->Mvctx); CHKERRQ(ierr);
760     /*  x = x + t */
761     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
762     VecDestroy(tt);
763     return 0;
764   }
765 
766 
767   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
768     if (flag & SOR_ZERO_INITIAL_GUESS) {
769       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
770     }
771     else {
772       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
773       CHKERRQ(ierr);
774       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
775       CHKERRQ(ierr);
776     }
777     while (its--) {
778       /* go down through the rows */
779       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
780                               mat->Mvctx); CHKERRQ(ierr);
781       for ( i=0; i<m; i++ ) {
782         n    = A->i[i+1] - A->i[i];
783         idx  = A->j + A->i[i] + shift;
784         v    = A->a + A->i[i] + shift;
785         sum  = b[i];
786         SPARSEDENSEMDOT(sum,xs,v,idx,n);
787         d    = fshift + A->a[diag[i]+shift];
788         n    = B->i[i+1] - B->i[i];
789         idx  = B->j + B->i[i] + shift;
790         v    = B->a + B->i[i] + shift;
791         SPARSEDENSEMDOT(sum,ls,v,idx,n);
792         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
793       }
794       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
795                             mat->Mvctx); CHKERRQ(ierr);
796       /* come up through the rows */
797       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
798                               mat->Mvctx); CHKERRQ(ierr);
799       for ( i=m-1; i>-1; i-- ) {
800         n    = A->i[i+1] - A->i[i];
801         idx  = A->j + A->i[i] + shift;
802         v    = A->a + A->i[i] + shift;
803         sum  = b[i];
804         SPARSEDENSEMDOT(sum,xs,v,idx,n);
805         d    = fshift + A->a[diag[i]+shift];
806         n    = B->i[i+1] - B->i[i];
807         idx  = B->j + B->i[i] + shift;
808         v    = B->a + B->i[i] + shift;
809         SPARSEDENSEMDOT(sum,ls,v,idx,n);
810         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
811       }
812       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
813                             mat->Mvctx); CHKERRQ(ierr);
814     }
815   }
816   else if (flag & SOR_FORWARD_SWEEP){
817     if (flag & SOR_ZERO_INITIAL_GUESS) {
818       VecSet(&zero,mat->lvec);
819       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
820                               mat->Mvctx); CHKERRQ(ierr);
821       for ( i=0; i<m; i++ ) {
822         n    = diag[i] - A->i[i];
823         idx  = A->j + A->i[i] + shift;
824         v    = A->a + A->i[i] + shift;
825         sum  = b[i];
826         SPARSEDENSEMDOT(sum,xs,v,idx,n);
827         d    = fshift + A->a[diag[i]+shift];
828         n    = B->i[i+1] - B->i[i];
829         idx  = B->j + B->i[i] + shift;
830         v    = B->a + B->i[i] + shift;
831         SPARSEDENSEMDOT(sum,ls,v,idx,n);
832         x[i] = omega*(sum/d);
833       }
834       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
835                             mat->Mvctx); CHKERRQ(ierr);
836       its--;
837     }
838     while (its--) {
839       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
840       CHKERRQ(ierr);
841       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
842       CHKERRQ(ierr);
843       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
844                               mat->Mvctx); CHKERRQ(ierr);
845       for ( i=0; i<m; i++ ) {
846         n    = A->i[i+1] - A->i[i];
847         idx  = A->j + A->i[i] + shift;
848         v    = A->a + A->i[i] + shift;
849         sum  = b[i];
850         SPARSEDENSEMDOT(sum,xs,v,idx,n);
851         d    = fshift + A->a[diag[i]+shift];
852         n    = B->i[i+1] - B->i[i];
853         idx  = B->j + B->i[i] + shift;
854         v    = B->a + B->i[i] + shift;
855         SPARSEDENSEMDOT(sum,ls,v,idx,n);
856         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
857       }
858       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
859                             mat->Mvctx); CHKERRQ(ierr);
860     }
861   }
862   else if (flag & SOR_BACKWARD_SWEEP){
863     if (flag & SOR_ZERO_INITIAL_GUESS) {
864       VecSet(&zero,mat->lvec);
865       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
866                               mat->Mvctx); CHKERRQ(ierr);
867       for ( i=m-1; i>-1; i-- ) {
868         n    = A->i[i+1] - diag[i] - 1;
869         idx  = A->j + diag[i] + !shift;
870         v    = A->a + diag[i] + !shift;
871         sum  = b[i];
872         SPARSEDENSEMDOT(sum,xs,v,idx,n);
873         d    = fshift + A->a[diag[i]+shift];
874         n    = B->i[i+1] - B->i[i];
875         idx  = B->j + B->i[i] + shift;
876         v    = B->a + B->i[i] + shift;
877         SPARSEDENSEMDOT(sum,ls,v,idx,n);
878         x[i] = omega*(sum/d);
879       }
880       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
881                             mat->Mvctx); CHKERRQ(ierr);
882       its--;
883     }
884     while (its--) {
885       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
886                             mat->Mvctx); CHKERRQ(ierr);
887       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
888                             mat->Mvctx); CHKERRQ(ierr);
889       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
890                               mat->Mvctx); CHKERRQ(ierr);
891       for ( i=m-1; i>-1; i-- ) {
892         n    = A->i[i+1] - A->i[i];
893         idx  = A->j + A->i[i] + shift;
894         v    = A->a + A->i[i] + shift;
895         sum  = b[i];
896         SPARSEDENSEMDOT(sum,xs,v,idx,n);
897         d    = fshift + A->a[diag[i]+shift];
898         n    = B->i[i+1] - B->i[i];
899         idx  = B->j + B->i[i] + shift;
900         v    = B->a + B->i[i] + shift;
901         SPARSEDENSEMDOT(sum,ls,v,idx,n);
902         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
903       }
904       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
905                             mat->Mvctx); CHKERRQ(ierr);
906     }
907   }
908   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
909     if (flag & SOR_ZERO_INITIAL_GUESS) {
910       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
911     }
912     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
913     CHKERRQ(ierr);
914     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
915     CHKERRQ(ierr);
916     while (its--) {
917       /* go down through the rows */
918       for ( i=0; i<m; i++ ) {
919         n    = A->i[i+1] - A->i[i];
920         idx  = A->j + A->i[i] + shift;
921         v    = A->a + A->i[i] + shift;
922         sum  = b[i];
923         SPARSEDENSEMDOT(sum,xs,v,idx,n);
924         d    = fshift + A->a[diag[i]+shift];
925         n    = B->i[i+1] - B->i[i];
926         idx  = B->j + B->i[i] + shift;
927         v    = B->a + B->i[i] + shift;
928         SPARSEDENSEMDOT(sum,ls,v,idx,n);
929         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
930       }
931       /* come up through the rows */
932       for ( i=m-1; i>-1; i-- ) {
933         n    = A->i[i+1] - A->i[i];
934         idx  = A->j + A->i[i] + shift;
935         v    = A->a + A->i[i] + shift;
936         sum  = b[i];
937         SPARSEDENSEMDOT(sum,xs,v,idx,n);
938         d    = fshift + A->a[diag[i]+shift];
939         n    = B->i[i+1] - B->i[i];
940         idx  = B->j + B->i[i] + shift;
941         v    = B->a + B->i[i] + shift;
942         SPARSEDENSEMDOT(sum,ls,v,idx,n);
943         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
944       }
945     }
946   }
947   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
948     if (flag & SOR_ZERO_INITIAL_GUESS) {
949       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
950     }
951     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
952     CHKERRQ(ierr);
953     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
954     CHKERRQ(ierr);
955     while (its--) {
956       for ( i=0; i<m; i++ ) {
957         n    = A->i[i+1] - A->i[i];
958         idx  = A->j + A->i[i] + shift;
959         v    = A->a + A->i[i] + shift;
960         sum  = b[i];
961         SPARSEDENSEMDOT(sum,xs,v,idx,n);
962         d    = fshift + A->a[diag[i]+shift];
963         n    = B->i[i+1] - B->i[i];
964         idx  = B->j + B->i[i] + shift;
965         v    = B->a + B->i[i] + shift;
966         SPARSEDENSEMDOT(sum,ls,v,idx,n);
967         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
968       }
969     }
970   }
971   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
972     if (flag & SOR_ZERO_INITIAL_GUESS) {
973       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
974     }
975     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
976                             mat->Mvctx); CHKERRQ(ierr);
977     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
978                             mat->Mvctx); CHKERRQ(ierr);
979     while (its--) {
980       for ( i=m-1; i>-1; i-- ) {
981         n    = A->i[i+1] - A->i[i];
982         idx  = A->j + A->i[i] + shift;
983         v    = A->a + A->i[i] + shift;
984         sum  = b[i];
985         SPARSEDENSEMDOT(sum,xs,v,idx,n);
986         d    = fshift + A->a[diag[i]+shift];
987         n    = B->i[i+1] - B->i[i];
988         idx  = B->j + B->i[i] + shift;
989         v    = B->a + B->i[i] + shift;
990         SPARSEDENSEMDOT(sum,ls,v,idx,n);
991         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
992       }
993     }
994   }
995   return 0;
996 }
997 
998 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
999                              int *nzalloc,int *mem)
1000 {
1001   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1002   Mat        A = mat->A, B = mat->B;
1003   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1004 
1005   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1006   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1007   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1008   if (flag == MAT_LOCAL) {
1009     *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
1010   } else if (flag == MAT_GLOBAL_MAX) {
1011     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1012     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1013   } else if (flag == MAT_GLOBAL_SUM) {
1014     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1015     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1016   }
1017   return 0;
1018 }
1019 
1020 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1021 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1022 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1023 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1024 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1025 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1026 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1027 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1028 
1029 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1030 {
1031   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1032 
1033   if (op == NO_NEW_NONZERO_LOCATIONS ||
1034       op == YES_NEW_NONZERO_LOCATIONS ||
1035       op == COLUMNS_SORTED ||
1036       op == ROW_ORIENTED) {
1037         MatSetOption(a->A,op);
1038         MatSetOption(a->B,op);
1039   }
1040   else if (op == ROWS_SORTED ||
1041            op == SYMMETRIC_MATRIX ||
1042            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1043            op == YES_NEW_DIAGONALS)
1044     PLogInfo((PetscObject)A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1045   else if (op == COLUMN_ORIENTED)
1046     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:COLUMN_ORIENTED");}
1047   else if (op == NO_NEW_DIAGONALS)
1048     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1049   else
1050     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1051   return 0;
1052 }
1053 
1054 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1055 {
1056   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1057   *m = mat->M; *n = mat->N;
1058   return 0;
1059 }
1060 
1061 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1062 {
1063   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1064   *m = mat->m; *n = mat->N;
1065   return 0;
1066 }
1067 
1068 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1069 {
1070   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1071   *m = mat->rstart; *n = mat->rend;
1072   return 0;
1073 }
1074 
1075 static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1076 {
1077   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1078   Scalar     *vworkA, *vworkB, **pvA, **pvB;
1079   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1080   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1081 
1082   if (!mat->assembled) SETERRQ(1,"MatGetRow_MPIAIJ:Must assemble matrix");
1083   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows");
1084   lrow = row - rstart;
1085 
1086   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1087   if (!v)   {pvA = 0; pvB = 0;}
1088   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1089   ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1090   ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1091   nztot = nzA + nzB;
1092 
1093   if (v  || idx) {
1094     if (nztot) {
1095       /* Sort by increasing column numbers, assuming A and B already sorted */
1096       int imark;
1097       if (mat->assembled) {
1098         for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
1099       }
1100       if (v) {
1101         *v = (Scalar *) PetscMalloc( (nztot)*sizeof(Scalar) ); CHKPTRQ(*v);
1102         for ( i=0; i<nzB; i++ ) {
1103           if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
1104           else break;
1105         }
1106         imark = i;
1107         for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
1108         for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
1109       }
1110       if (idx) {
1111         *idx = (int *) PetscMalloc( (nztot)*sizeof(int) ); CHKPTRQ(*idx);
1112         for (i=0; i<nzA; i++) cworkA[i] += cstart;
1113         for ( i=0; i<nzB; i++ ) {
1114           if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
1115           else break;
1116         }
1117         imark = i;
1118         for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
1119         for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
1120       }
1121     }
1122     else {*idx = 0; *v=0;}
1123   }
1124   *nz = nztot;
1125   ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1126   ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1127   return 0;
1128 }
1129 
1130 static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1131 {
1132   if (idx) PetscFree(*idx);
1133   if (v) PetscFree(*v);
1134   return 0;
1135 }
1136 
1137 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1138 {
1139   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1140   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1141   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1142   double     sum = 0.0;
1143   Scalar     *v;
1144 
1145   if (!aij->assembled) SETERRQ(1,"MatNorm_MPIAIJ:Must assemble matrix");
1146   if (aij->size == 1) {
1147     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1148   } else {
1149     if (type == NORM_FROBENIUS) {
1150       v = amat->a;
1151       for (i=0; i<amat->nz; i++ ) {
1152 #if defined(PETSC_COMPLEX)
1153         sum += real(conj(*v)*(*v)); v++;
1154 #else
1155         sum += (*v)*(*v); v++;
1156 #endif
1157       }
1158       v = bmat->a;
1159       for (i=0; i<bmat->nz; i++ ) {
1160 #if defined(PETSC_COMPLEX)
1161         sum += real(conj(*v)*(*v)); v++;
1162 #else
1163         sum += (*v)*(*v); v++;
1164 #endif
1165       }
1166       MPI_Allreduce((void*)&sum,(void*)norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1167       *norm = sqrt(*norm);
1168     }
1169     else if (type == NORM_1) { /* max column norm */
1170       double *tmp, *tmp2;
1171       int    *jj, *garray = aij->garray;
1172       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1173       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1174       PetscMemzero(tmp,aij->N*sizeof(double));
1175       *norm = 0.0;
1176       v = amat->a; jj = amat->j;
1177       for ( j=0; j<amat->nz; j++ ) {
1178         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1179       }
1180       v = bmat->a; jj = bmat->j;
1181       for ( j=0; j<bmat->nz; j++ ) {
1182         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1183       }
1184       MPI_Allreduce((void*)tmp,(void*)tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1185       for ( j=0; j<aij->N; j++ ) {
1186         if (tmp2[j] > *norm) *norm = tmp2[j];
1187       }
1188       PetscFree(tmp); PetscFree(tmp2);
1189     }
1190     else if (type == NORM_INFINITY) { /* max row norm */
1191       double ntemp = 0.0;
1192       for ( j=0; j<amat->m; j++ ) {
1193         v = amat->a + amat->i[j] + shift;
1194         sum = 0.0;
1195         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1196           sum += PetscAbsScalar(*v); v++;
1197         }
1198         v = bmat->a + bmat->i[j] + shift;
1199         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1200           sum += PetscAbsScalar(*v); v++;
1201         }
1202         if (sum > ntemp) ntemp = sum;
1203       }
1204       MPI_Allreduce((void*)&ntemp,(void*)norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1205     }
1206     else {
1207       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1208     }
1209   }
1210   return 0;
1211 }
1212 
1213 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1214 {
1215   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1216   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1217   int        ierr,shift = Aloc->indexshift;
1218   Mat        B;
1219   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1220   Scalar     *array;
1221 
1222   if (!matout && M != N) SETERRQ(1,"MatTranspose_MPIAIJ:Square only in-place");
1223   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PetscNull,0,
1224          PetscNull,&B); CHKERRQ(ierr);
1225 
1226   /* copy over the A part */
1227   Aloc = (Mat_SeqAIJ*) a->A->data;
1228   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1229   row = a->rstart;
1230   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1231   for ( i=0; i<m; i++ ) {
1232     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1233     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1234   }
1235   aj = Aloc->j;
1236   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1237 
1238   /* copy over the B part */
1239   Aloc = (Mat_SeqAIJ*) a->B->data;
1240   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1241   row = a->rstart;
1242   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1243   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1244   for ( i=0; i<m; i++ ) {
1245     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1246     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1247   }
1248   PetscFree(ct);
1249   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1250   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1251   if (matout) {
1252     *matout = B;
1253   } else {
1254     /* This isn't really an in-place transpose, but we free the data structures of a */
1255     PetscFree(a->rowners);
1256     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1257     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1258     if (a->colmap) PetscFree(a->colmap);
1259     if (a->garray) PetscFree(a->garray);
1260     if (a->lvec) VecDestroy(a->lvec);
1261     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1262     PetscFree(a);
1263     PetscMemcpy(A,B,sizeof(struct _Mat));
1264     PetscHeaderDestroy(B);
1265   }
1266   return 0;
1267 }
1268 
1269 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1270 static int MatCopyPrivate_MPIAIJ(Mat,Mat *,int);
1271 
1272 /* -------------------------------------------------------------------*/
1273 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1274        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1275        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1276        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1277        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1278        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1279        MatLUFactor_MPIAIJ,0,
1280        MatRelax_MPIAIJ,
1281        MatTranspose_MPIAIJ,
1282        MatGetInfo_MPIAIJ,0,
1283        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1284        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1285        0,
1286        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1287        MatGetReordering_MPIAIJ,
1288        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1289        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1290        MatILUFactorSymbolic_MPIAIJ,0,
1291        0,0,MatConvert_MPIAIJ,0,0,MatCopyPrivate_MPIAIJ,0,0,
1292        0,0,0,
1293        0,0,MatGetValues_MPIAIJ};
1294 
1295 /*@C
1296    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1297    (the default parallel PETSc format).
1298 
1299    Input Parameters:
1300 .  comm - MPI communicator
1301 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
1302 .  n - number of local columns (or PETSC_DECIDE to have it calculated
1303            if N is given)
1304 .  M - number of global rows (or PETSC_DECIDE to have it calculated if m is given)
1305 .  N - number of global columns (or PETSC_DECIDE to have it calculated
1306            if n is given)
1307 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1308            (same for all local rows)
1309 .  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
1310            or null (possibly different for each row).  You must leave room
1311            for the diagonal entry even if it is zero.
1312 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1313            submatrix (same for all local rows).
1314 .  o_nnz - number of nonzeros per row in off-diagonal portion of local
1315            submatrix or null (possibly different for each row).
1316 
1317    Output Parameter:
1318 .  newmat - the matrix
1319 
1320    Notes:
1321    The AIJ format (also called the Yale sparse matrix format or
1322    compressed row storage) is fully compatible with standard Fortran 77
1323    storage.  That is, the stored row and column indices can begin at
1324    either one (as in Fortran) or zero.  See the users manual for details.
1325 
1326    The user MUST specify either the local or global matrix dimensions
1327    (possibly both).
1328 
1329    Storage Information:
1330    For a square global matrix we define each processor's diagonal portion
1331    to be its local rows and the corresponding columns (a square submatrix);
1332    each processor's off-diagonal portion encompasses the remainder of the
1333    local matrix (a rectangular submatrix).
1334 
1335    The user can specify preallocated storage for the diagonal part of
1336    the local submatrix with either d_nz or d_nnz (not both).  Set d_nz=0
1337    and d_nnz=PetscNull for PETSc to control dynamic memory allocation.
1338    Likewise, specify preallocated storage for the off-diagonal part of
1339    the local submatrix with o_nz or o_nnz (not both).
1340 
1341    By default, this format uses inodes (identical nodes) when possible.
1342    We search for consecutive rows with the same nonzero structure, thereby
1343    reusing matrix information to achieve increased efficiency.
1344 
1345    Options Database Keys:
1346 $    -mat_aij_no_inode  - Do not use inodes
1347 $    -mat_aij_inode_limit <limit> - Set inode limit (max limit=5)
1348 
1349 .keywords: matrix, aij, compressed row, sparse, parallel
1350 
1351 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1352 @*/
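/*
   Example usage (a minimal sketch; m, the preallocation values 5 and 2, and
   the communicator are illustrative only, and error checking of ierr is
   omitted):

       Mat A;
       int ierr, m = 100;
       ierr = MatCreateMPIAIJ(MPI_COMM_WORLD,m,m,PETSC_DECIDE,PETSC_DECIDE,
                              5,PetscNull,2,PetscNull,&A);
*/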
1353 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1354                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *newmat)
1355 {
1356   Mat          mat;
1357   Mat_MPIAIJ   *a;
1358   int          ierr, i,sum[2],work[2];
1359 
1360   *newmat         = 0;
1361   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1362   PLogObjectCreate(mat);
1363   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1364   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1365   mat->destroy    = MatDestroy_MPIAIJ;
1366   mat->view       = MatView_MPIAIJ;
1367   mat->factor     = 0;
1368 
1369   a->insertmode = NOT_SET_VALUES;
1370   MPI_Comm_rank(comm,&a->rank);
1371   MPI_Comm_size(comm,&a->size);
1372 
1373   if (m == PETSC_DECIDE && (d_nnz != PetscNull || o_nnz != PetscNull))
1374     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSc decide rows but set d_nnz or o_nnz");
1375 
1376   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1377     work[0] = m; work[1] = n;
1378     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1379     if (M == PETSC_DECIDE) M = sum[0];
1380     if (N == PETSC_DECIDE) N = sum[1];
1381   }
1382   if (m == PETSC_DECIDE) {m = M/a->size + ((M % a->size) > a->rank);}
1383   if (n == PETSC_DECIDE) {n = N/a->size + ((N % a->size) > a->rank);}
1384   a->m = m;
1385   a->n = n;
1386   a->N = N;
1387   a->M = M;
1388 
1389   /* build local table of row and column ownerships */
1390   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1391   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1392   a->cowners = a->rowners + a->size + 1;
1393   MPI_Allgather(&m,1,MPI_INT,a->rowners+1,1,MPI_INT,comm);
1394   a->rowners[0] = 0;
1395   for ( i=2; i<=a->size; i++ ) {
1396     a->rowners[i] += a->rowners[i-1];
1397   }
1398   a->rstart = a->rowners[a->rank];
1399   a->rend   = a->rowners[a->rank+1];
1400   MPI_Allgather(&n,1,MPI_INT,a->cowners+1,1,MPI_INT,comm);
1401   a->cowners[0] = 0;
1402   for ( i=2; i<=a->size; i++ ) {
1403     a->cowners[i] += a->cowners[i-1];
1404   }
1405   a->cstart = a->cowners[a->rank];
1406   a->cend   = a->cowners[a->rank+1];
1407 
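  /* A holds the m by n "diagonal" block (the locally owned columns); B is
     created with the full column dimension N and holds the remaining
     "off-diagonal" entries (its columns are later compacted to only those
     actually used, once the matrix is assembled) */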
1408   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&a->A); CHKERRQ(ierr);
1409   PLogObjectParent(mat,a->A);
1410   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&a->B); CHKERRQ(ierr);
1411   PLogObjectParent(mat,a->B);
1412 
1413   /* build cache (stash) for entries destined for rows owned by other processors */
1414   ierr = StashBuild_Private(&a->stash); CHKERRQ(ierr);
1415   a->colmap    = 0;
1416   a->garray    = 0;
1417 
1418   /* data structures for the matrix-vector multiply; these are built later, when the matrix is assembled */
1419   a->lvec      = 0;
1420   a->Mvctx     = 0;
1421   a->assembled = 0;
1422 
1423   *newmat = mat;
1424   return 0;
1425 }
1426 
1427 static int MatCopyPrivate_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1428 {
1429   Mat        mat;
1430   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1431   int        ierr, len;
1432 
1433   if (!oldmat->assembled) SETERRQ(1,"MatCopyPrivate_MPIAIJ:Must assemble matrix");
1434   *newmat       = 0;
1435   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1436   PLogObjectCreate(mat);
1437   mat->data     = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1438   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1439   mat->destroy  = MatDestroy_MPIAIJ;
1440   mat->view     = MatView_MPIAIJ;
1441   mat->factor   = matin->factor;
1442 
1443   a->m          = oldmat->m;
1444   a->n          = oldmat->n;
1445   a->M          = oldmat->M;
1446   a->N          = oldmat->N;
1447 
1448   a->assembled  = 1;
1449   a->rstart     = oldmat->rstart;
1450   a->rend       = oldmat->rend;
1451   a->cstart     = oldmat->cstart;
1452   a->cend       = oldmat->cend;
1453   a->size       = oldmat->size;
1454   a->rank       = oldmat->rank;
1455   a->insertmode = NOT_SET_VALUES;
1456 
1457   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1458   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1459   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1460   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
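  /* duplicate the optional global-to-local column map and the list of
     global column numbers (garray) for the off-diagonal part, if the
     original matrix has built them */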
1461   if (oldmat->colmap) {
1462     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1463     PLogObjectMemory(mat,(a->N)*sizeof(int));
1464     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1465   } else a->colmap = 0;
1466   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1467     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1468     PLogObjectMemory(mat,len*sizeof(int));
1469     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1470   } else a->garray = 0;
1471 
1472   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1473   PLogObjectParent(mat,a->lvec);
1474   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1475   PLogObjectParent(mat,a->Mvctx);
1476   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1477   PLogObjectParent(mat,a->A);
1478   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1479   PLogObjectParent(mat,a->B);
1480   *newmat = mat;
1481   return 0;
1482 }
1483 
1484 #include "sysio.h"
1485 
1486 int MatLoad_MPIAIJorMPIRow(Viewer bview,MatType type,Mat *newmat)
1487 {
1488   Mat          A;
1489   int          i, nz, ierr, j,rstart, rend, fd;
1490   Scalar       *vals,*svals;
1491   PetscObject  vobj = (PetscObject) bview;
1492   MPI_Comm     comm = vobj->comm;
1493   MPI_Status   status;
1494   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1495   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1496   int          tag = ((PetscObject)bview)->tag;
1497 
1498   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1499   if (!rank) {
1500     ierr = ViewerFileGetDescriptor_Private(bview,&fd); CHKERRQ(ierr);
1501     ierr = SYRead(fd,(char *)header,4,SYINT); CHKERRQ(ierr);
1502     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:not matrix object");
1503   }
1504 
1505   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1506   M = header[1]; N = header[2];
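  /* header[0] is the matrix cookie and header[1], header[2] are the global
     dimensions; header[3] (presumably the total nonzero count in the file)
     is not needed here */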
1507   /* determine ownership of all rows */
1508   m = M/size + ((M % size) > rank);
1509   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1510   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1511   rowners[0] = 0;
1512   for ( i=2; i<=size; i++ ) {
1513     rowners[i] += rowners[i-1];
1514   }
1515   rstart = rowners[rank];
1516   rend   = rowners[rank+1];
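  /* this reproduces the default row distribution used by MatCreateMPIAIJ(),
     so the rows read on this process are exactly the rows the new matrix
     will own */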
1517 
1518   /* distribute row lengths to all processors */
1519   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1520   offlens = ourlens + (rend-rstart);
1521   if (!rank) {
1522     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1523     ierr = SYRead(fd,rowlengths,M,SYINT); CHKERRQ(ierr);
1524     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1525     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
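    /* rowners doubles as the displacement array: process i's row lengths
       start at offset rowners[i] in rowlengths */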
1526     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1527     PetscFree(sndcounts);
1528   }
1529   else {
1530     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1531   }
1532 
1533   if (!rank) {
1534     /* calculate the number of nonzeros on each processor */
1535     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1536     PetscMemzero(procsnz,size*sizeof(int));
1537     for ( i=0; i<size; i++ ) {
1538       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1539         procsnz[i] += rowlengths[j];
1540       }
1541     }
1542     PetscFree(rowlengths);
1543 
1544     /* determine max buffer needed and allocate it */
1545     maxnz = 0;
1546     for ( i=0; i<size; i++ ) {
1547       maxnz = PetscMax(maxnz,procsnz[i]);
1548     }
1549     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1550 
1551     /* read in my part of the matrix column indices  */
1552     nz = procsnz[0];
1553     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1554     ierr = SYRead(fd,mycols,nz,SYINT); CHKERRQ(ierr);
1555 
1556     /* read in everyone else's column indices and ship them off */
1557     for ( i=1; i<size; i++ ) {
1558       nz = procsnz[i];
1559       ierr = SYRead(fd,cols,nz,SYINT); CHKERRQ(ierr);
1560       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1561     }
1562     PetscFree(cols);
1563   }
1564   else {
1565     /* determine buffer space needed for message */
1566     nz = 0;
1567     for ( i=0; i<m; i++ ) {
1568       nz += ourlens[i];
1569     }
1570     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1571 
1572     /* receive the message of column indices */
1573     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1574     MPI_Get_count(&status,MPI_INT,&maxnz);
1575     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:something is wrong with the file");
1576   }
1577 
1578   /* loop over local rows, determining number of off-diagonal entries */
1579   PetscMemzero(offlens,m*sizeof(int));
1580   jj = 0;
1581   for ( i=0; i<m; i++ ) {
1582     for ( j=0; j<ourlens[i]; j++ ) {
1583       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1584       jj++;
1585     }
1586   }
1587 
1588   /* create our matrix, using ourlens (with the off-diagonal counts subtracted)
          and offlens as the d_nnz and o_nnz preallocation arrays */
1589   for ( i=0; i<m; i++ ) {
1590     ourlens[i] -= offlens[i];
1591   }
1592   if (type == MATMPIAIJ) {
1593     ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1594   }
1595   else if (type == MATMPIROW) {
1596     ierr = MatCreateMPIRow(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1597   }
  else SETERRQ(1,"MatLoad_MPIAIJorMPIRow:Unsupported matrix type");
1598   A = *newmat;
1599   MatSetOption(A,COLUMNS_SORTED);
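  /* restore ourlens to the full row lengths (diagonal plus off-diagonal
     parts); they are used below as the per-row counts passed to
     MatSetValues() */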
1600   for ( i=0; i<m; i++ ) {
1601     ourlens[i] += offlens[i];
1602   }
1603 
1604   if (!rank) {
1605     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1606 
1607     /* read in my part of the matrix numerical values  */
1608     nz = procsnz[0];
1609     ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1610 
1611     /* insert into matrix */
1612     jj      = rstart;
1613     smycols = mycols;
1614     svals   = vals;
1615     for ( i=0; i<m; i++ ) {
1616       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1617       smycols += ourlens[i];
1618       svals   += ourlens[i];
1619       jj++;
1620     }
1621 
1622     /* read in the other processors' values and ship them out */
1623     for ( i=1; i<size; i++ ) {
1624       nz = procsnz[i];
1625       ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1626       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1627     }
1628     PetscFree(procsnz);
1629   }
1630   else {
1631     /* receive numeric values */
1632     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1633 
1634     /* receive the message of values */
1635     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1636     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1637     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:something is wrong with the file");
1638 
1639     /* insert into matrix */
1640     jj      = rstart;
1641     smycols = mycols;
1642     svals   = vals;
1643     for ( i=0; i<m; i++ ) {
1644       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1645       smycols += ourlens[i];
1646       svals   += ourlens[i];
1647       jj++;
1648     }
1649   }
1650   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1651 
1652   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1653   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1654   return 0;
1655 }
1656