xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision d3e5ee8800a1fc69869e95d48ef2feb107aaa57d)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.72 1995/09/06 03:05:29 bsmith Exp curfman $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
/* local utility routine that creates a mapping from the global column
number to the local number in the off-diagonal part of the local
storage of the matrix.  This is done in a non-scalable way since the
length of colmap equals the global matrix length.
*/
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_AIJ    *B = (Mat_AIJ*) aij->B->data;
18   int        n = B->n,i;
19   aij->colmap = (int *) PETSCMALLOC(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
20   PLogObjectMemory(mat,aij->N*sizeof(int));
21   PETSCMEMSET(aij->colmap,0,aij->N*sizeof(int));
22   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
23   return 0;
24 }
25 
26 extern int DisAssemble_MPIAIJ(Mat);
27 
28 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
29 {
30   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
31   int ierr;
32   if (aij->numtids == 1) {
33     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
34   } else
35     SETERRQ(1,"MatGetReordering_MPIAIJ:  not yet supported in parallel");
36   return 0;
37 }
38 
/*
   Inserts or adds an m-by-n logically dense block of values.  Rows owned
   by this processor are routed to the diagonal (A) or off-diagonal (B)
   local blocks; rows owned elsewhere are stashed and communicated during
   assembly.  INSERTVALUES and ADDVALUES may not be mixed between
   assemblies.
*/
static int MatSetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,
                            int *idxn,Scalar *v,InsertMode addv)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
  int        cstart = aij->cstart, cend = aij->cend,row,col;

  /* all calls between two assemblies must use the same insert mode */
  if (aij->insertmode != NOTSETVALUES && aij->insertmode != addv) {
    SETERRQ(1,"MatSetValues_MPIAIJ:You cannot mix inserts and adds");
  }
  aij->insertmode = addv;
  for ( i=0; i<m; i++ ) {
    if (idxm[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
    if (idxm[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
    if (idxm[i] >= rstart && idxm[i] < rend) {
      /* locally owned row */
      row = idxm[i] - rstart;
      for ( j=0; j<n; j++ ) {
        if (idxn[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
        if (idxn[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
        if (idxn[j] >= cstart && idxn[j] < cend){
          /* column lies in the diagonal block */
          col = idxn[j] - cstart;
          ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
        }
        else {
          if (aij->assembled) {
            /* once assembled, B uses compressed column numbering; translate
               through colmap (built lazily; entries stored +1, 0 = absent) */
            if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
            col = aij->colmap[idxn[j]] - 1;
            /* a new nonzero location: disassemble so B can accept it */
            if (col < 0 && !((Mat_AIJ*)(aij->A->data))->nonew) {
              ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
              col =  idxn[j];
            }
          }
          else col = idxn[j];
          ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERRQ(ierr);
        }
      }
    }
    else {
      /* off-processor row: stash; sent to the owner during assembly */
      ierr = StashValues_Private(&aij->stash,idxm[i],n,idxn,v+i*n,addv);
      CHKERRQ(ierr);
    }
  }
  return 0;
}
83 
/*
    The assembly code is a lot like the code for vectors; we should
    someday derive a single assembly routine that can be used for
    either case.
*/
89 
/*
   First phase of assembly: routes all stashed (off-processor) values to
   their owning processors with nonblocking MPI.  Posts the receives and
   starts the sends; MatAssemblyEnd_MPIAIJ() completes the exchange.
   Each communicated entry is a (row,column,value) triple of Scalars;
   the indices are stored as Scalars to simplify the message passing.
*/
static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         numtids = aij->numtids, *owners = aij->rowners;
  int         mytid = aij->mytid;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  int         tag = mat->tag, *owner,*starts,count,ierr;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  /* make sure all processors are either in INSERTMODE or ADDMODE */
  MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,1,MPI_INT,
                MPI_BOR,comm);
  if (addv == (ADDVALUES|INSERTVALUES)) {
    SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
  }
  aij->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PETSCMALLOC( 2*numtids*sizeof(int) ); CHKPTRQ(nprocs);
  PETSCMEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) PETSCMALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<aij->stash.n; i++ ) {
    idx = aij->stash.idx[i];
    /* linear search over the row-ownership ranges for idx's owner */
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nreceives = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  PETSCFREE(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PETSCMALLOC(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PETSCMALLOC((nreceives+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PETSCMALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
  CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PETSCMALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack the triples destination-by-destination (starts[] is advanced) */
  for ( i=0; i<aij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
  }
  PETSCFREE(owner);
  /* starts[] was consumed above; rebuild it for the sends */
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPIU_SCALAR,i,tag,
                comm,send_waits+count++);
    }
  }
  PETSCFREE(starts); PETSCFREE(nprocs);

  /* Free cache space */
  ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);

  /* stash communication state in the matrix for MatAssemblyEnd_MPIAIJ */
  aij->svalues    = svalues;    aij->rvalues = rvalues;
  aij->nsends     = nsends;     aij->nrecvs = nreceives;
  aij->send_waits = send_waits; aij->recv_waits = recv_waits;
  aij->rmax       = nmax;

  return 0;
}
190 extern int MatSetUpMultiply_MPIAIJ(Mat);
191 
/*
   Second phase of assembly: receives the (row,column,value) triples sent
   in MatAssemblyBegin_MPIAIJ(), inserts them into the local A or B
   blocks, waits for the sends to drain, then assembles both sequential
   blocks.  On the first FINAL_ASSEMBLY the multiply communication
   structures are also created.
*/
static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  int        ierr;
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n;
  int         row,col,other_disassembled;
  Scalar      *values,val;
  InsertMode  addv = aij->insertmode;

  /*  wait on receives */
  while (count) {
    MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    values = aij->rvalues + 3*imdex*aij->rmax;
    MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
    n = n/3;   /* number of (row,col,value) triples in this message */
    for ( i=0; i<n; i++ ) {
      row = (int) PETSCREAL(values[3*i]) - aij->rstart;
      col = (int) PETSCREAL(values[3*i+1]);
      val = values[3*i+2];
      if (col >= aij->cstart && col < aij->cend) {
          col -= aij->cstart;
        MatSetValues(aij->A,1,&row,1,&col,&val,addv);
      }
      else {
        if (aij->assembled) {
          /* B uses compressed column numbering once assembled; translate
             the global column via colmap (entries +1, 0 = absent) */
          if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
          col = aij->colmap[col] - 1;
          /* a new nonzero location: disassemble so B can accept it */
          if (col < 0  && !((Mat_AIJ*)(aij->A->data))->nonew) {
            ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
            col = (int) PETSCREAL(values[3*i+1]);
          }
        }
        MatSetValues(aij->B,1,&row,1,&col,&val,addv);
      }
    }
    count--;
  }
  PETSCFREE(aij->recv_waits); PETSCFREE(aij->rvalues);

  /* wait on sends */
  if (aij->nsends) {
    send_status = (MPI_Status *) PETSCMALLOC( aij->nsends*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(aij->nsends,aij->send_waits,send_status);
    PETSCFREE(send_status);
  }
  PETSCFREE(aij->send_waits); PETSCFREE(aij->svalues);

  aij->insertmode = NOTSETVALUES;
  ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled; if so we must
     also disassemble ourselves, in order that we may reassemble.
     (MPI_PROD of the assembled flags is zero if any one is zero.) */
  MPI_Allreduce((void *) &aij->assembled,(void *) &other_disassembled,1,
                 MPI_INT,MPI_PROD,mat->comm);
  if (aij->assembled && !other_disassembled) {
    ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
  }

  if (!aij->assembled && mode == FINAL_ASSEMBLY) {
    /* build lvec and the scatter context used by the multiply routines */
    ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
    aij->assembled = 1;
  }
  ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);

  return 0;
}
263 
264 static int MatZeroEntries_MPIAIJ(Mat A)
265 {
266   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
267   int ierr;
268   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
269   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
270   return 0;
271 }

/* again this uses the same basic strategy as in the assembly and
   scatter create routines; we should try to do it systematically
   if we can figure out the proper level of generality. */
276 
/* the code does not do the diagonal entries correctly unless the
   matrix is square and the column and row ownerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
/*
   Zeros the globally numbered rows listed in is; diag is passed through
   to MatZeroRows() on the local diagonal block (see the BUG note above
   regarding non-square matrices / mismatched row and column ownership).
   Each processor first routes the row numbers it holds to the owning
   processors with nonblocking MPI, then zeros the rows it received in
   its local A and B blocks.
*/
static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
  int            *procs,*nprocs,j,found,idx,nsends,*work;
  int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  if (!l->assembled)
    SETERRQ(1,"MatZeroRows_MPIAIJ:Must assemble matrix first");
  ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PETSCMALLOC( 2*numtids*sizeof(int) ); CHKPTRQ(nprocs);
  PETSCMEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) PETSCMALLOC((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* locate the owner of global row idx via the ownership ranges */
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
  }
  nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) PETSCMALLOC( numtids*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nrecvs = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  PETSCFREE(work);

  /* post receives:   */
  rvalues = (int *) PETSCMALLOC((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PETSCMALLOC((nrecvs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (int *) PETSCMALLOC( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PETSCMALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PETSCMALLOC( (numtids+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack the row numbers destination-by-destination (advances starts[]) */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* starts[] was consumed while packing; rebuild it for the sends */
  starts[0] = 0;
  for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
                comm,send_waits+count++);
    }
  }
  PETSCFREE(starts);

  base = owners[mytid];  /* first global row owned by this processor */

  /*  wait on receives */
  lens = (int *) PETSCMALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PETSCFREE(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) PETSCMALLOC( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;  /* global -> local row number */
    }
  }
  PETSCFREE(rvalues); PETSCFREE(lens);
  PETSCFREE(owner); PETSCFREE(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateSequential(MPI_COMM_SELF,slen,lrows,&istmp);
  PLogObjectParent(A,istmp);
  CHKERRQ(ierr);  PETSCFREE(lrows);
  ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PETSCMALLOC( nsends*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PETSCFREE(send_status);
  }
  PETSCFREE(send_waits); PETSCFREE(svalues);


  return 0;
}
410 
411 static int MatMult_MPIAIJ(Mat aijin,Vec xx,Vec yy)
412 {
413   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
414   int        ierr;
415   if (!aij->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix first");
416   ierr = VecScatterBegin(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
417   CHKERRQ(ierr);
418   ierr = MatMult(aij->A,xx,yy); CHKERRQ(ierr);
419   ierr = VecScatterEnd(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
420   CHKERRQ(ierr);
421   ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERRQ(ierr);
422   return 0;
423 }
424 
425 static int MatMultAdd_MPIAIJ(Mat aijin,Vec xx,Vec yy,Vec zz)
426 {
427   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
428   int        ierr;
429   if (!aij->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix first");
430   ierr = VecScatterBegin(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
431   CHKERRQ(ierr);
432   ierr = MatMultAdd(aij->A,xx,yy,zz); CHKERRQ(ierr);
433   ierr = VecScatterEnd(xx,aij->lvec,INSERTVALUES,SCATTERALL,aij->Mvctx);
434   CHKERRQ(ierr);
435   ierr = MatMultAdd(aij->B,aij->lvec,zz,zz); CHKERRQ(ierr);
436   return 0;
437 }
438 
439 static int MatMultTrans_MPIAIJ(Mat aijin,Vec xx,Vec yy)
440 {
441   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
442   int        ierr;
443 
444   if (!aij->assembled)
445     SETERRQ(1,"MatMulTrans_MPIAIJ:must assemble matrix first");
446   /* do nondiagonal part */
447   ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERRQ(ierr);
448   /* send it on its way */
449   ierr = VecScatterBegin(aij->lvec,yy,ADDVALUES,
450            (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
451   /* do local part */
452   ierr = MatMultTrans(aij->A,xx,yy); CHKERRQ(ierr);
453   /* receive remote parts: note this assumes the values are not actually */
454   /* inserted in yy until the next line, which is true for my implementation*/
455   /* but is not perhaps always true. */
456   ierr = VecScatterEnd(aij->lvec,yy,ADDVALUES,
457          (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
458   return 0;
459 }
460 
461 static int MatMultTransAdd_MPIAIJ(Mat aijin,Vec xx,Vec yy,Vec zz)
462 {
463   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
464   int        ierr;
465 
466   if (!aij->assembled)
467     SETERRQ(1,"MatMulTransAdd_MPIAIJ:must assemble matrix first");
468   /* do nondiagonal part */
469   ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERRQ(ierr);
470   /* send it on its way */
471   ierr = VecScatterBegin(aij->lvec,zz,ADDVALUES,
472          (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
473   /* do local part */
474   ierr = MatMultTransAdd(aij->A,xx,yy,zz); CHKERRQ(ierr);
475   /* receive remote parts: note this assumes the values are not actually */
476   /* inserted in yy until the next line, which is true for my implementation*/
477   /* but is not perhaps always true. */
478   ierr = VecScatterEnd(aij->lvec,zz,ADDVALUES,
479        (ScatterMode)(SCATTERALL|SCATTERREVERSE),aij->Mvctx); CHKERRQ(ierr);
480   return 0;
481 }
482 
483 /*
484   This only works correctly for square matrices where the subblock A->A is the
485    diagonal block
486 */
487 static int MatGetDiagonal_MPIAIJ(Mat Ain,Vec v)
488 {
489   Mat_MPIAIJ *A = (Mat_MPIAIJ *) Ain->data;
490   if (!A->assembled) SETERRQ(1,"MatGetDiag_MPIAIJ:must assemble matrix first");
491   return MatGetDiagonal(A->A,v);
492 }
493 
494 static int MatDestroy_MPIAIJ(PetscObject obj)
495 {
496   Mat        mat = (Mat) obj;
497   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
498   int        ierr;
499 #if defined(PETSC_LOG)
500   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
501 #endif
502   PETSCFREE(aij->rowners);
503   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
504   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
505   if (aij->colmap) PETSCFREE(aij->colmap);
506   if (aij->garray) PETSCFREE(aij->garray);
507   if (aij->lvec) VecDestroy(aij->lvec);
508   if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
509   PETSCFREE(aij);
510   PLogObjectDestroy(mat);
511   PETSCHEADERDESTROY(mat);
512   return 0;
513 }
514 #include "draw.h"
515 #include "pinclude/pviewer.h"
516 
/*
   Viewer for MPIAIJ matrices.  For a per-processor ASCII file each
   processor prints its ownership ranges and its local A (diagonal) and
   B (off-diagonal) blocks in rank order.  For single-file or draw
   viewers the entire matrix is first gathered onto processor 0 and
   viewed there.
*/
static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
{
  Mat         mat = (Mat) obj;
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  int         ierr, format;
  PetscObject vobj = (PetscObject) viewer;

  if (!aij->assembled) SETERRQ(1,"MatView_MPIAIJ:must assemble matrix first");
  if (!viewer) { /* so that viewers may be used from debuggers */
    viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
  }
  if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
  ierr = ViewerFileGetFormat_Private(viewer,&format);
  if (vobj->cookie == VIEWER_COOKIE && format == FILE_FORMAT_INFO &&
     (vobj->type == FILE_VIEWER || vobj->type == FILES_VIEWER)) {
   /* do nothing for now */
  }
  else if (vobj->cookie == VIEWER_COOKIE && vobj->type == FILE_VIEWER) {
    /* each processor writes its own portion, sequenced via MPIU_Seq */
    FILE *fd;
    ierr = ViewerFileGetPointer_Private(viewer,&fd); CHKERRQ(ierr);
    MPIU_Seq_begin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
           aij->cend);
    ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    MPIU_Seq_end(mat->comm,1);
  }
  else if ((vobj->cookie == VIEWER_COOKIE && vobj->type == FILES_VIEWER) ||
            vobj->cookie == DRAW_COOKIE) {
    int numtids = aij->numtids, mytid = aij->mytid;
    if (numtids == 1) {
      ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    }
    else {
      /* assemble the entire matrix onto first processor. */
      Mat     A;
      Mat_AIJ *Aloc;
      int     M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
      Scalar  *a;

      /* processor 0 holds the whole gathered matrix; others hold no rows */
      if (!mytid) {
        ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,0,0,0,&A);
      }
      else {
        ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,0,0,0,&A);
      }
      PLogObjectParent(mat,A);
      CHKERRQ(ierr);

      /* copy over the A part; the column indices (which appear to be
         stored 1-based) are shifted in place to global numbering and
         restored afterwards */
      Aloc = (Mat_AIJ*) aij->A->data;
      m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      for ( i=0; i<ai[m]-1; i++ ) {aj[i] += aij->cstart - 1;}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERTVALUES);
        CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
      }
      aj = Aloc->j;
      for ( i=0; i<ai[m]-1; i++ ) {aj[i] -= aij->cstart - 1;}

      /* copy over the B part; garray translates compressed local column
         numbers back to global column numbers */
      Aloc = (Mat_AIJ*) aij->B->data;
      m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      ct = cols = (int *) PETSCMALLOC( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
      for ( i=0; i<ai[m]-1; i++ ) {cols[i] = aij->garray[aj[i]-1];}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERTVALUES);
        CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
      }
      PETSCFREE(ct);
      ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      if (!mytid) {
        /* only processor 0 actually views the gathered matrix */
        ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  return 0;
}
603 
604 extern int MatMarkDiag_AIJ(Mat);
/*
    This has to provide several versions.

     1) per sequential
     2) a) use only local smoothing, updating outer values only once.
        b) local smoothing, updating outer values each inner iteration
     3) color updating outer values between colors.
*/
613 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
614                            double shift,int its,Vec xx)
615 {
616   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
617   Mat        AA = mat->A, BB = mat->B;
618   Mat_AIJ    *A = (Mat_AIJ *) AA->data, *B = (Mat_AIJ *)BB->data;
619   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
620   int        ierr,*idx, *diag;
621   int        n = mat->n, m = mat->m, i;
622   Vec        tt;
623 
624   if (!mat->assembled) SETERRQ(1,"MatRelax_MPIAIJ:must assemble matrix first");
625 
626   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
627   xs = x -1; /* shift by one for index start of 1 */
628   ls--;
629   if (!A->diag) {if ((ierr = MatMarkDiag_AIJ(AA))) return ierr;}
630   diag = A->diag;
631   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
632     SETERRQ(1,"MatRelax_MPIAIJ:Option not yet supported");
633   }
634   if (flag & SOR_EISENSTAT) {
635     /* Let  A = L + U + D; where L is lower trianglar,
636     U is upper triangular, E is diagonal; This routine applies
637 
638             (L + E)^{-1} A (U + E)^{-1}
639 
640     to a vector efficiently using Eisenstat's trick. This is for
641     the case of SSOR preconditioner, so E is D/omega where omega
642     is the relaxation factor.
643     */
644     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
645     PLogObjectParent(matin,tt);
646     VecGetArray(tt,&t);
647     scale = (2.0/omega) - 1.0;
648     /*  x = (E + U)^{-1} b */
649     VecSet(&zero,mat->lvec);
650     ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
651                               mat->Mvctx); CHKERRQ(ierr);
652     for ( i=m-1; i>-1; i-- ) {
653       n    = A->i[i+1] - diag[i] - 1;
654       idx  = A->j + diag[i];
655       v    = A->a + diag[i];
656       sum  = b[i];
657       SPARSEDENSEMDOT(sum,xs,v,idx,n);
658       d    = shift + A->a[diag[i]-1];
659       n    = B->i[i+1] - B->i[i];
660       idx  = B->j + B->i[i] - 1;
661       v    = B->a + B->i[i] - 1;
662       SPARSEDENSEMDOT(sum,ls,v,idx,n);
663       x[i] = omega*(sum/d);
664     }
665     ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
666                             mat->Mvctx); CHKERRQ(ierr);
667 
668     /*  t = b - (2*E - D)x */
669     v = A->a;
670     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ - 1])*x[i]; }
671 
672     /*  t = (E + L)^{-1}t */
673     ts = t - 1; /* shifted by one for index start of a or mat->j*/
674     diag = A->diag;
675     VecSet(&zero,mat->lvec);
676     ierr = VecPipelineBegin(tt,mat->lvec,INSERTVALUES,PIPELINEDOWN,
677                                                  mat->Mvctx); CHKERRQ(ierr);
678     for ( i=0; i<m; i++ ) {
679       n    = diag[i] - A->i[i];
680       idx  = A->j + A->i[i] - 1;
681       v    = A->a + A->i[i] - 1;
682       sum  = t[i];
683       SPARSEDENSEMDOT(sum,ts,v,idx,n);
684       d    = shift + A->a[diag[i]-1];
685       n    = B->i[i+1] - B->i[i];
686       idx  = B->j + B->i[i] - 1;
687       v    = B->a + B->i[i] - 1;
688       SPARSEDENSEMDOT(sum,ls,v,idx,n);
689       t[i] = omega*(sum/d);
690     }
691     ierr = VecPipelineEnd(tt,mat->lvec,INSERTVALUES,PIPELINEDOWN,
692                                                     mat->Mvctx); CHKERRQ(ierr);
693     /*  x = x + t */
694     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
695     VecDestroy(tt);
696     return 0;
697   }
698 
699 
700   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
701     if (flag & SOR_ZERO_INITIAL_GUESS) {
702       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
703     }
704     else {
705       ierr=VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
706       CHKERRQ(ierr);
707       ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
708       CHKERRQ(ierr);
709     }
710     while (its--) {
711       /* go down through the rows */
712       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
713                               mat->Mvctx); CHKERRQ(ierr);
714       for ( i=0; i<m; i++ ) {
715         n    = A->i[i+1] - A->i[i];
716         idx  = A->j + A->i[i] - 1;
717         v    = A->a + A->i[i] - 1;
718         sum  = b[i];
719         SPARSEDENSEMDOT(sum,xs,v,idx,n);
720         d    = shift + A->a[diag[i]-1];
721         n    = B->i[i+1] - B->i[i];
722         idx  = B->j + B->i[i] - 1;
723         v    = B->a + B->i[i] - 1;
724         SPARSEDENSEMDOT(sum,ls,v,idx,n);
725         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
726       }
727       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
728                             mat->Mvctx); CHKERRQ(ierr);
729       /* come up through the rows */
730       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
731                               mat->Mvctx); CHKERRQ(ierr);
732       for ( i=m-1; i>-1; i-- ) {
733         n    = A->i[i+1] - A->i[i];
734         idx  = A->j + A->i[i] - 1;
735         v    = A->a + A->i[i] - 1;
736         sum  = b[i];
737         SPARSEDENSEMDOT(sum,xs,v,idx,n);
738         d    = shift + A->a[diag[i]-1];
739         n    = B->i[i+1] - B->i[i];
740         idx  = B->j + B->i[i] - 1;
741         v    = B->a + B->i[i] - 1;
742         SPARSEDENSEMDOT(sum,ls,v,idx,n);
743         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
744       }
745       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
746                             mat->Mvctx); CHKERRQ(ierr);
747     }
748   }
749   else if (flag & SOR_FORWARD_SWEEP){
750     if (flag & SOR_ZERO_INITIAL_GUESS) {
751       VecSet(&zero,mat->lvec);
752       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
753                               mat->Mvctx); CHKERRQ(ierr);
754       for ( i=0; i<m; i++ ) {
755         n    = diag[i] - A->i[i];
756         idx  = A->j + A->i[i] - 1;
757         v    = A->a + A->i[i] - 1;
758         sum  = b[i];
759         SPARSEDENSEMDOT(sum,xs,v,idx,n);
760         d    = shift + A->a[diag[i]-1];
761         n    = B->i[i+1] - B->i[i];
762         idx  = B->j + B->i[i] - 1;
763         v    = B->a + B->i[i] - 1;
764         SPARSEDENSEMDOT(sum,ls,v,idx,n);
765         x[i] = omega*(sum/d);
766       }
767       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
768                             mat->Mvctx); CHKERRQ(ierr);
769       its--;
770     }
771     while (its--) {
772       ierr=VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
773       CHKERRQ(ierr);
774       ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERUP,mat->Mvctx);
775       CHKERRQ(ierr);
776       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
777                               mat->Mvctx); CHKERRQ(ierr);
778       for ( i=0; i<m; i++ ) {
779         n    = A->i[i+1] - A->i[i];
780         idx  = A->j + A->i[i] - 1;
781         v    = A->a + A->i[i] - 1;
782         sum  = b[i];
783         SPARSEDENSEMDOT(sum,xs,v,idx,n);
784         d    = shift + A->a[diag[i]-1];
785         n    = B->i[i+1] - B->i[i];
786         idx  = B->j + B->i[i] - 1;
787         v    = B->a + B->i[i] - 1;
788         SPARSEDENSEMDOT(sum,ls,v,idx,n);
789         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
790       }
791       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEDOWN,
792                             mat->Mvctx); CHKERRQ(ierr);
793     }
794   }
795   else if (flag & SOR_BACKWARD_SWEEP){
796     if (flag & SOR_ZERO_INITIAL_GUESS) {
797       VecSet(&zero,mat->lvec);
798       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
799                               mat->Mvctx); CHKERRQ(ierr);
800       for ( i=m-1; i>-1; i-- ) {
801         n    = A->i[i+1] - diag[i] - 1;
802         idx  = A->j + diag[i];
803         v    = A->a + diag[i];
804         sum  = b[i];
805         SPARSEDENSEMDOT(sum,xs,v,idx,n);
806         d    = shift + A->a[diag[i]-1];
807         n    = B->i[i+1] - B->i[i];
808         idx  = B->j + B->i[i] - 1;
809         v    = B->a + B->i[i] - 1;
810         SPARSEDENSEMDOT(sum,ls,v,idx,n);
811         x[i] = omega*(sum/d);
812       }
813       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
814                             mat->Mvctx); CHKERRQ(ierr);
815       its--;
816     }
817     while (its--) {
818       ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERDOWN,
819                             mat->Mvctx); CHKERRQ(ierr);
820       ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERDOWN,
821                             mat->Mvctx); CHKERRQ(ierr);
822       ierr = VecPipelineBegin(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
823                               mat->Mvctx); CHKERRQ(ierr);
824       for ( i=m-1; i>-1; i-- ) {
825         n    = A->i[i+1] - A->i[i];
826         idx  = A->j + A->i[i] - 1;
827         v    = A->a + A->i[i] - 1;
828         sum  = b[i];
829         SPARSEDENSEMDOT(sum,xs,v,idx,n);
830         d    = shift + A->a[diag[i]-1];
831         n    = B->i[i+1] - B->i[i];
832         idx  = B->j + B->i[i] - 1;
833         v    = B->a + B->i[i] - 1;
834         SPARSEDENSEMDOT(sum,ls,v,idx,n);
835         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
836       }
837       ierr = VecPipelineEnd(xx,mat->lvec,INSERTVALUES,PIPELINEUP,
838                             mat->Mvctx); CHKERRQ(ierr);
839     }
840   }
841   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
842     if (flag & SOR_ZERO_INITIAL_GUESS) {
843       return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
844     }
845     ierr=VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
846     CHKERRQ(ierr);
847     ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
848     CHKERRQ(ierr);
849     while (its--) {
850       /* go down through the rows */
851       for ( i=0; i<m; i++ ) {
852         n    = A->i[i+1] - A->i[i];
853         idx  = A->j + A->i[i] - 1;
854         v    = A->a + A->i[i] - 1;
855         sum  = b[i];
856         SPARSEDENSEMDOT(sum,xs,v,idx,n);
857         d    = shift + A->a[diag[i]-1];
858         n    = B->i[i+1] - B->i[i];
859         idx  = B->j + B->i[i] - 1;
860         v    = B->a + B->i[i] - 1;
861         SPARSEDENSEMDOT(sum,ls,v,idx,n);
862         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
863       }
864       /* come up through the rows */
865       for ( i=m-1; i>-1; i-- ) {
866         n    = A->i[i+1] - A->i[i];
867         idx  = A->j + A->i[i] - 1;
868         v    = A->a + A->i[i] - 1;
869         sum  = b[i];
870         SPARSEDENSEMDOT(sum,xs,v,idx,n);
871         d    = shift + A->a[diag[i]-1];
872         n    = B->i[i+1] - B->i[i];
873         idx  = B->j + B->i[i] - 1;
874         v    = B->a + B->i[i] - 1;
875         SPARSEDENSEMDOT(sum,ls,v,idx,n);
876         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
877       }
878     }
879   }
880   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
881     if (flag & SOR_ZERO_INITIAL_GUESS) {
882       return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
883     }
884     ierr=VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
885     CHKERRQ(ierr);
886     ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,mat->Mvctx);
887     CHKERRQ(ierr);
888     while (its--) {
889       for ( i=0; i<m; i++ ) {
890         n    = A->i[i+1] - A->i[i];
891         idx  = A->j + A->i[i] - 1;
892         v    = A->a + A->i[i] - 1;
893         sum  = b[i];
894         SPARSEDENSEMDOT(sum,xs,v,idx,n);
895         d    = shift + A->a[diag[i]-1];
896         n    = B->i[i+1] - B->i[i];
897         idx  = B->j + B->i[i] - 1;
898         v    = B->a + B->i[i] - 1;
899         SPARSEDENSEMDOT(sum,ls,v,idx,n);
900         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
901       }
902     }
903   }
904   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
905     if (flag & SOR_ZERO_INITIAL_GUESS) {
906       return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
907     }
908     ierr = VecScatterBegin(xx,mat->lvec,INSERTVALUES,SCATTERALL,
909                             mat->Mvctx); CHKERRQ(ierr);
910     ierr = VecScatterEnd(xx,mat->lvec,INSERTVALUES,SCATTERALL,
911                             mat->Mvctx); CHKERRQ(ierr);
912     while (its--) {
913       for ( i=m-1; i>-1; i-- ) {
914         n    = A->i[i+1] - A->i[i];
915         idx  = A->j + A->i[i] - 1;
916         v    = A->a + A->i[i] - 1;
917         sum  = b[i];
918         SPARSEDENSEMDOT(sum,xs,v,idx,n);
919         d    = shift + A->a[diag[i]-1];
920         n    = B->i[i+1] - B->i[i];
921         idx  = B->j + B->i[i] - 1;
922         v    = B->a + B->i[i] - 1;
923         SPARSEDENSEMDOT(sum,ls,v,idx,n);
924         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
925       }
926     }
927   }
928   return 0;
929 }
930 
931 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
932                              int *nzalloc,int *mem)
933 {
934   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
935   Mat        A = mat->A, B = mat->B;
936   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
937 
938   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
939   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
940   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
941   if (flag == MAT_LOCAL) {
942     *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
943   } else if (flag == MAT_GLOBAL_MAX) {
944     MPI_Allreduce((void *) isend,(void *) irecv,3,MPI_INT,MPI_MAX,matin->comm);
945     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
946   } else if (flag == MAT_GLOBAL_SUM) {
947     MPI_Allreduce((void *) isend,(void *) irecv,3,MPI_INT,MPI_SUM,matin->comm);
948     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
949   }
950   return 0;
951 }
952 
953 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
954 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
955 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
956 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
957 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
958 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
959 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
960 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
961 
962 static int MatSetOption_MPIAIJ(Mat aijin,MatOption op)
963 {
964   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) aijin->data;
965 
966   if      (op == NO_NEW_NONZERO_LOCATIONS)  {
967     MatSetOption(aij->A,op);
968     MatSetOption(aij->B,op);
969   }
970   else if (op == YES_NEW_NONZERO_LOCATIONS) {
971     MatSetOption(aij->A,op);
972     MatSetOption(aij->B,op);
973   }
974   else if (op == COLUMN_ORIENTED)
975     SETERRQ(1,"MatSetOption_MPIAIJ:Column oriented not supported");
976   return 0;
977 }
978 
979 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
980 {
981   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
982   *m = mat->M; *n = mat->N;
983   return 0;
984 }
985 
986 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
987 {
988   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
989   *m = mat->m; *n = mat->N;
990   return 0;
991 }
992 
993 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
994 {
995   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
996   *m = mat->rstart; *n = mat->rend;
997   return 0;
998 }
999 
1000 static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1001 {
1002   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1003   Scalar     *vworkA, *vworkB, **pvA, **pvB;
1004   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1005   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1006 
1007   if (row < rstart || row >= rend)
1008     SETERRQ(1,"MatGetRow_MPIAIJ:Currently you can get only local rows")
1009   lrow = row - rstart;
1010 
1011   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1012   if (!v)   {pvA = 0; pvB = 0;}
1013   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1014   ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1015   ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1016   nztot = nzA + nzB;
1017 
1018   if (v  || idx) {
1019     if (nztot) {
1020       /* Sort by increasing column numbers, assuming A and B already sorted */
1021       int imark;
1022       if (mat->assembled) {
1023         for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
1024       }
1025       if (v) {
1026         *v = (Scalar *) PETSCMALLOC( (nztot)*sizeof(Scalar) ); CHKPTRQ(*v);
1027         for ( i=0; i<nzB; i++ ) {
1028           if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
1029           else break;
1030         }
1031         imark = i;
1032         for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
1033         for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
1034       }
1035       if (idx) {
1036         *idx = (int *) PETSCMALLOC( (nztot)*sizeof(int) ); CHKPTRQ(*idx);
1037         for (i=0; i<nzA; i++) cworkA[i] += cstart;
1038         for ( i=0; i<nzB; i++ ) {
1039           if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
1040           else break;
1041         }
1042         imark = i;
1043         for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
1044         for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
1045       }
1046     }
1047     else {*idx = 0; *v=0;}
1048   }
1049   *nz = nztot;
1050   ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1051   ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1052   return 0;
1053 }
1054 
1055 static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1056 {
1057   if (idx) PETSCFREE(*idx);
1058   if (v) PETSCFREE(*v);
1059   return 0;
1060 }
1061 
1062 static int MatNorm_MPIAIJ(Mat mat,MatNormType type,double *norm)
1063 {
1064   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1065   int        ierr, i, j, rstart = aij->rstart;
1066   double     sum = 0.0;
1067   Mat_AIJ    *amat = (Mat_AIJ*) aij->A->data, *bmat = (Mat_AIJ*) aij->B->data;
1068   Scalar     *v;
1069 
1070   if (!aij->assembled)
1071     SETERRQ(1,"MatNorm_MPIAIJ: Cannot compute norm of unassembled matrix");
1072   if (aij->numtids == 1) {
1073     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1074   } else {
1075     if (type == NORM_FROBENIUS) {
1076       v = amat->a;
1077       for (i=0; i<amat->nz; i++ ) {
1078 #if defined(PETSC_COMPLEX)
1079         sum += real(conj(*v)*(*v)); v++;
1080 #else
1081         sum += (*v)*(*v); v++;
1082 #endif
1083       }
1084       v = bmat->a;
1085       for (i=0; i<bmat->nz; i++ ) {
1086 #if defined(PETSC_COMPLEX)
1087         sum += real(conj(*v)*(*v)); v++;
1088 #else
1089         sum += (*v)*(*v); v++;
1090 #endif
1091       }
1092       MPI_Allreduce((void*)&sum,(void*)norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1093       *norm = sqrt(*norm);
1094     }
1095     else if (type == NORM_1) { /* max column norm */
1096       double *tmp, *tmp2;
1097       int    *jj, *garray = aij->garray;
1098       tmp = (double *) PETSCMALLOC( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1099       tmp2 = (double *) PETSCMALLOC( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1100       PETSCMEMSET(tmp,0,aij->N*sizeof(double));
1101       *norm = 0.0;
1102       v = amat->a; jj = amat->j;
1103       for ( j=0; j<amat->nz; j++ ) {
1104 #if defined(PETSC_COMPLEX)
1105         tmp[rstart + *jj++ - 1] += abs(*v++);
1106 #else
1107         tmp[rstart + *jj++ - 1] += fabs(*v++);
1108 #endif
1109       }
1110       v = bmat->a; jj = bmat->j;
1111       for ( j=0; j<bmat->nz; j++ ) {
1112 #if defined(PETSC_COMPLEX)
1113         tmp[garray[*jj++ - 1]] += abs(*v++);
1114 #else
1115         tmp[garray[*jj++ - 1]] += fabs(*v++);
1116 #endif
1117       }
1118       MPI_Allreduce((void*)tmp,(void*)tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1119       for ( j=0; j<aij->N; j++ ) {
1120         if (tmp2[j] > *norm) *norm = tmp2[j];
1121       }
1122       PETSCFREE(tmp); PETSCFREE(tmp2);
1123     }
1124     else if (type == NORM_INFINITY) { /* max row norm */
1125       double normtemp = 0.0;
1126       for ( j=0; j<amat->m; j++ ) {
1127         v = amat->a + amat->i[j] - 1;
1128         sum = 0.0;
1129         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1130 #if defined(PETSC_COMPLEX)
1131           sum += abs(*v); v++;
1132 #else
1133           sum += fabs(*v); v++;
1134 #endif
1135         }
1136         v = bmat->a + bmat->i[j] - 1;
1137         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1138 #if defined(PETSC_COMPLEX)
1139           sum += abs(*v); v++;
1140 #else
1141           sum += fabs(*v); v++;
1142 #endif
1143         }
1144         if (sum > normtemp) normtemp = sum;
1145         MPI_Allreduce((void*)&normtemp,(void*)norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1146       }
1147     }
1148     else {
1149       SETERRQ(1,"MatNorm_MPIRow:No support for the two norm");
1150     }
1151   }
1152   return 0;
1153 }
1154 
static int MatTranspose_MPIAIJ(Mat A,Mat *Bin)
{
  /*
     Forms the explicit transpose of a parallel AIJ matrix by inserting
     each local row of A as a column of a newly created matrix B.
     Column indices in this format are 1-based (Fortran style), hence
     the "- 1" adjustments below.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
  int        ierr;
  Mat        B;
  Mat_AIJ    *Aloc;
  int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
  Scalar     *array;

  ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,0,0,0,&B);
  CHKERRQ(ierr);

  /* copy over the A part */
  Aloc = (Mat_AIJ*) a->A->data;
  m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row = a->rstart;
  /* temporarily shift A's local column numbers to global numbering,
     IN PLACE, so each row can be inserted directly as a column of B
     (ai[m]-1 is the number of stored nonzeros, since ai[0] == 1) */
  for ( i=0; i<ai[m]-1; i++ ) {aj[i] += a->cstart - 1;}
  for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERTVALUES);
      CHKERRQ(ierr);
      row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
  }
  /* undo the shift; aj was advanced by the loop above, so reset it first */
  aj = Aloc->j;
  for ( i=0; i<ai[m]-1; i++ ) {aj[i] -= a->cstart - 1;}

  /* copy over the B part */
  Aloc = (Mat_AIJ*) a->B->data;
  m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row = a->rstart;
  /* the off-diagonal block stores compressed local column numbers;
     garray maps them to global columns (copied into scratch, not in place) */
  ct = cols = (int *) PETSCMALLOC( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
  for ( i=0; i<ai[m]-1; i++ ) {cols[i] = a->garray[aj[i]-1];}
  for ( i=0; i<m; i++ ) {
    ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERTVALUES);
    CHKERRQ(ierr);
    row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
  }
  /* ct retains the original base of the scratch array (cols was advanced) */
  PETSCFREE(ct);
  ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
  *Bin = B;
  return 0;
}
1197 
1198 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1199 static int MatCopyPrivate_MPIAIJ(Mat,Mat *);
1200 
1201 /* -------------------------------------------------------------------*/
1202 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1203        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1204        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1205        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1206        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1207        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1208        MatLUFactor_MPIAIJ,0,
1209        MatRelax_MPIAIJ,
1210        MatTranspose_MPIAIJ,
1211        MatGetInfo_MPIAIJ,0,
1212        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1213        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1214        0,
1215        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1216        MatGetReordering_MPIAIJ,
1217        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1218        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1219        MatILUFactorSymbolic_MPIAIJ,0,
1220        0,0,MatConvert_MPIAIJ,0,0,MatCopyPrivate_MPIAIJ};
1221 
1222 /*@
1223    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1224    (the default parallel PETSc format).
1225 
1226    Input Parameters:
1227 .  comm - MPI communicator
1228 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1229 .  n - number of local columns (or PETSC_DECIDE to have calculated
1230            if N is given)
1231 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1232 .  N - number of global columns (or PETSC_DECIDE to have calculated
1233            if n is given)
1234 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1235            (same for all local rows)
1236 .  d_nzz - number of nonzeros per row in diagonal portion of local submatrix
1237            or null (possibly different for each row).  You must leave room
1238            for the diagonal entry even if it is zero.
1239 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1240            submatrix (same for all local rows).
1241 .  o_nzz - number of nonzeros per row in off-diagonal portion of local
1242            submatrix or null (possibly different for each row).
1243 
1244    Output Parameter:
1245 .  newmat - the matrix
1246 
1247    Notes:
1248    The AIJ format (also called the Yale sparse matrix format or
1249    compressed row storage), is fully compatible with standard Fortran 77
1250    storage.  That is, the stored row and column indices begin at
1251    one, not zero.  See the users manual for further details.
1252 
1253    The user MUST specify either the local or global matrix dimensions
1254    (possibly both).
1255 
1256    Storage Information:
1257    For a square global matrix we define each processor's diagonal portion
1258    to be its local rows and the corresponding columns (a square submatrix);
1259    each processor's off-diagonal portion encompasses the remainder of the
1260    local matrix (a rectangular submatrix).
1261 
1262    The user can specify preallocated storage for the diagonal part of
1263    the local submatrix with either d_nz or d_nnz (not both). Set both
1264    d_nz and d_nnz to zero for PETSc to control dynamic memory allocation.
1265    Likewise, specify preallocated storage for the off-diagonal part of
1266    the local submatrix with o_nz or o_nnz (not both).
1267 
1268 .keywords: matrix, aij, compressed row, sparse, parallel
1269 
1270 .seealso: MatCreate(), MatCreateSequentialAIJ(), MatSetValues()
1271 @*/
1272 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1273                  int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
1274 {
1275   Mat          mat;
1276   Mat_MPIAIJ   *aij;
1277   int          ierr, i,sum[2],work[2];
1278   *newmat         = 0;
1279   PETSCHEADERCREATE(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1280   PLogObjectCreate(mat);
1281   mat->data       = (void *) (aij = PETSCNEW(Mat_MPIAIJ)); CHKPTRQ(aij);
1282   mat->ops        = &MatOps;
1283   mat->destroy    = MatDestroy_MPIAIJ;
1284   mat->view       = MatView_MPIAIJ;
1285   mat->factor     = 0;
1286 
1287   aij->insertmode = NOTSETVALUES;
1288   MPI_Comm_rank(comm,&aij->mytid);
1289   MPI_Comm_size(comm,&aij->numtids);
1290 
1291   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1292     work[0] = m; work[1] = n;
1293     MPI_Allreduce((void *) work,(void *) sum,2,MPI_INT,MPI_SUM,comm );
1294     if (M == PETSC_DECIDE) M = sum[0];
1295     if (N == PETSC_DECIDE) N = sum[1];
1296   }
1297   if (m == PETSC_DECIDE)
1298     {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
1299   if (n == PETSC_DECIDE)
1300     {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
1301   aij->m = m;
1302   aij->n = n;
1303   aij->N = N;
1304   aij->M = M;
1305 
1306   /* build local table of row and column ownerships */
1307   aij->rowners = (int *) PETSCMALLOC(2*(aij->numtids+2)*sizeof(int));
1308   CHKPTRQ(aij->rowners);
1309   PLogObjectMemory(mat,2*(aij->numtids+2)*sizeof(int)+sizeof(struct _Mat)+
1310                        sizeof(Mat_MPIAIJ));
1311   aij->cowners = aij->rowners + aij->numtids + 1;
1312   MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
1313   aij->rowners[0] = 0;
1314   for ( i=2; i<=aij->numtids; i++ ) {
1315     aij->rowners[i] += aij->rowners[i-1];
1316   }
1317   aij->rstart = aij->rowners[aij->mytid];
1318   aij->rend   = aij->rowners[aij->mytid+1];
1319   MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
1320   aij->cowners[0] = 0;
1321   for ( i=2; i<=aij->numtids; i++ ) {
1322     aij->cowners[i] += aij->cowners[i-1];
1323   }
1324   aij->cstart = aij->cowners[aij->mytid];
1325   aij->cend   = aij->cowners[aij->mytid+1];
1326 
1327 
1328   ierr = MatCreateSequentialAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&aij->A);
1329   CHKERRQ(ierr);
1330   PLogObjectParent(mat,aij->A);
1331   ierr = MatCreateSequentialAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&aij->B);
1332   CHKERRQ(ierr);
1333   PLogObjectParent(mat,aij->B);
1334 
1335   /* build cache for off array entries formed */
1336   ierr = StashBuild_Private(&aij->stash); CHKERRQ(ierr);
1337   aij->colmap    = 0;
1338   aij->garray    = 0;
1339 
1340   /* stuff used for matrix vector multiply */
1341   aij->lvec      = 0;
1342   aij->Mvctx     = 0;
1343   aij->assembled = 0;
1344 
1345   *newmat = mat;
1346   return 0;
1347 }
1348 
static int MatCopyPrivate_MPIAIJ(Mat matin,Mat *newmat)
{
  /*
     Makes a deep copy of an assembled MPIAIJ matrix: header, size and
     ownership information, stash, column maps, the local-to-global map
     (garray), the scatter context, and both sequential blocks A and B.
  */
  Mat        mat;
  Mat_MPIAIJ *aij,*oldmat = (Mat_MPIAIJ *) matin->data;
  int        ierr, len;
  *newmat      = 0;

  if (!oldmat->assembled)
    SETERRQ(1,"MatCopyPrivate_MPIAIJ:Cannot copy unassembled matrix");
  PETSCHEADERCREATE(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
  PLogObjectCreate(mat);
  mat->data       = (void *) (aij = PETSCNEW(Mat_MPIAIJ)); CHKPTRQ(aij);
  mat->ops        = &MatOps;
  mat->destroy    = MatDestroy_MPIAIJ;
  mat->view       = MatView_MPIAIJ;
  mat->factor     = matin->factor;

  aij->m          = oldmat->m;
  aij->n          = oldmat->n;
  aij->M          = oldmat->M;
  aij->N          = oldmat->N;

  aij->assembled  = 1;
  aij->rstart     = oldmat->rstart;
  aij->rend       = oldmat->rend;
  aij->cstart     = oldmat->cstart;
  aij->cend       = oldmat->cend;
  aij->numtids    = oldmat->numtids;
  aij->mytid      = oldmat->mytid;
  aij->insertmode = NOTSETVALUES;

  /* NOTE(review): only the row ownership table is duplicated here, while
     MatCreateMPIAIJ() allocates rowners and cowners together (size
     2*(numtids+2)) and points cowners into that block; aij->cowners is
     left unset on the copy - confirm nothing dereferences cowners on a
     copied matrix. */
  aij->rowners        = (int *) PETSCMALLOC( (aij->numtids+1)*sizeof(int) );
  CHKPTRQ(aij->rowners);
  PLogObjectMemory(mat,(aij->numtids+1)*sizeof(int)+sizeof(struct _Mat)+
                       sizeof(Mat_MPIAIJ));
  PETSCMEMCPY(aij->rowners,oldmat->rowners,(aij->numtids+1)*sizeof(int));
  ierr = StashInitialize_Private(&aij->stash); CHKERRQ(ierr);
  /* colmap (global-to-local column map, length N) is copied only if the
     source matrix had built one */
  if (oldmat->colmap) {
    aij->colmap      = (int *) PETSCMALLOC( (aij->N)*sizeof(int) );
    CHKPTRQ(aij->colmap);
    PLogObjectMemory(mat,(aij->N)*sizeof(int));
    PETSCMEMCPY(aij->colmap,oldmat->colmap,(aij->N)*sizeof(int));
  } else aij->colmap = 0;
  /* garray length equals the number of columns of the compressed
     off-diagonal block B */
  if (oldmat->garray && (len = ((Mat_AIJ *) (oldmat->B->data))->n)) {
    aij->garray      = (int *) PETSCMALLOC(len*sizeof(int) ); CHKPTRQ(aij->garray);
    PLogObjectMemory(mat,len*sizeof(int));
    PETSCMEMCPY(aij->garray,oldmat->garray,len*sizeof(int));
  } else aij->garray = 0;

  ierr =  VecDuplicate(oldmat->lvec,&aij->lvec); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->lvec);
  ierr =  VecScatterCtxCopy(oldmat->Mvctx,&aij->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->Mvctx);
  ierr =  MatConvert(oldmat->A,MATSAME,&aij->A); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->A);
  ierr =  MatConvert(oldmat->B,MATSAME,&aij->B); CHKERRQ(ierr);
  PLogObjectParent(mat,aij->B);
  *newmat = mat;
  return 0;
}
1409