xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 1eb62cbb64d64b61ce4c1a3ec8cb7777b3ff11c2)
1 
2 #include "mpiaij.h"
3 #include "vec/vecimpl.h"
4 
5 
6 #define CHUNCKSIZE   100
/*
   This is a simple minded stash. Do a linear search to determine if
 in stash, if not add to end.

   StashValues - caches one row's worth of (row,column,value) triples
   that are destined for a matrix row owned by another processor; the
   cache is drained at assembly time.

   The three stash arrays live in ONE allocation laid out as
   [nmax Scalars | nmax row ints | nmax column ints]; that is why the
   grow code below recomputes idx/idy from the new Scalar block and
   frees only stash->array.

   NOTE(review): N is stash->n as of entry, so two occurrences of the
   same (row,col) within a single call both append; with InsertValues
   the final value then depends on later insertion order — confirm
   callers never pass duplicate indices in one call.
*/
static int StashValues(Stash *stash,int row,int n, int *idxn,
                       Scalar *values,InsertMode addv)
{
  int    i,j,N = stash->n,found,*n_idx, *n_idy;
  Scalar val,*n_array;

  for ( i=0; i<n; i++ ) {
    found = 0;
    val = *values++;
    /* linear search for an existing entry (row,idxn[i]) */
    for ( j=0; j<N; j++ ) {
      if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) {
        /* found a match */
        if (addv == AddValues) stash->array[j] += val;
        else stash->array[j] = val;
        found = 1;
        break;
      }
    }
    if (!found) { /* not found so add to end */
      if ( stash->n == stash->nmax ) {
        /* allocate a larger stash: one block holding values, then row
           indices, then column indices, each nmax+CHUNCKSIZE long */
        n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*(
                                     2*sizeof(int) + sizeof(Scalar)));
        CHKPTR(n_array);
        n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE);
        n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE);
        /* copy the old contents into the head of each new section */
        MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar));
        MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int));
        MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int));
        if (stash->array) FREE(stash->array);
        stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy;
        stash->nmax += CHUNCKSIZE;
      }
      stash->array[stash->n]   = val;
      stash->idx[stash->n]     = row;
      stash->idy[stash->n++]   = idxn[i];
    }
  }
  return 0;
}
51 
52 static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n,
53                             int *idxn,Scalar *v,InsertMode addv)
54 {
55   Matimpiaij *aij = (Matimpiaij *) mat->data;
56   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
57   int        cstart = aij->cstart, cend = aij->cend,row,col;
58 
59   if (aij->insertmode != NotSetValues && aij->insertmode != addv) {
60     SETERR(1,"You cannot mix inserts and adds");
61   }
62   aij->insertmode = addv;
63   for ( i=0; i<m; i++ ) {
64     if (idxm[i] >= rstart && idxm[i] < rend) {
65       row = idxm[i] - rstart;
66       for ( j=0; j<n; j++ ) {
67         if (idxn[j] >= cstart && idxn[j] < cend){
68           col = idxn[j] - cstart;
69           ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
70         }
71         else {
72           col = idxn[j];
73           ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
74         }
75       }
76     }
77     else {
78       ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr);
79     }
80   }
81   return 0;
82 }
83 
84 /*
85     the assembly code is alot like the code for vectors, we should
86     sometime derive a single assembly code that can be used for
87     either case.
88 */
89 
90 static int MatiAIJBeginAssemble(Mat mat)
91 {
92   Matimpiaij  *aij = (Matimpiaij *) mat->data;
93   MPI_Comm    comm = aij->comm;
94   int         ierr, numtids = aij->numtids, *owners = aij->rowners;
95   int         mytid = aij->mytid;
96   MPI_Request *send_waits,*recv_waits;
97   int         *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work;
98   int         tag = 50, *owner,*starts,count;
99   InsertMode  addv;
100   Scalar      *rvalues,*svalues;
101 
102   /* make sure all processors are either in INSERTMODE or ADDMODE */
103   MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT,
104                 MPI_BOR,comm);
105   if (addv == (AddValues|InsertValues)) {
106     SETERR(1,"Some processors have inserted while others have added");
107   }
108   aij->insertmode = addv; /* in case this processor had no cache */
109 
110   /*  first count number of contributors to each processor */
111   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
112   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
113   owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner);
114   for ( i=0; i<aij->stash.n; i++ ) {
115     idx = aij->stash.idx[i];
116     for ( j=0; j<numtids; j++ ) {
117       if (idx >= owners[j] && idx < owners[j+1]) {
118         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
119       }
120     }
121   }
122   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
123 
124   /* inform other processors of number of messages and max length*/
125   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
126   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
127   nreceives = work[mytid];
128   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
129   nmax = work[mytid];
130   FREE(work);
131 
132   /* post receives:
133        1) each message will consist of ordered pairs
134      (global index,value) we store the global index as a double
135      to simply the message passing.
136        2) since we don't know how long each individual message is we
137      allocate the largest needed buffer for each receive. Potentially
138      this is a lot of wasted space.
139 
140 
141        This could be done better.
142   */
143   rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar));
144   CHKPTR(rvalues);
145   recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request));
146   CHKPTR(recv_waits);
147   for ( i=0; i<nreceives; i++ ) {
148     MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag,
149               comm,recv_waits+i);
150   }
151 
152   /* do sends:
153       1) starts[i] gives the starting index in svalues for stuff going to
154          the ith processor
155   */
156   svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
157   CHKPTR(svalues);
158   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
159   CHKPTR(send_waits);
160   starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts);
161   starts[0] = 0;
162   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
163   for ( i=0; i<aij->stash.n; i++ ) {
164     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
165     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
166     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
167   }
168   FREE(owner);
169   starts[0] = 0;
170   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
171   count = 0;
172   for ( i=0; i<numtids; i++ ) {
173     if (procs[i]) {
174       MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag,
175                 comm,send_waits+count++);
176     }
177   }
178   FREE(starts); FREE(nprocs);
179 
180   /* Free cache space */
181   aij->stash.nmax = aij->stash.n = 0;
182   if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;}
183 
184   aij->svalues    = svalues;       aij->rvalues = rvalues;
185   aij->nsends     = nsends;         aij->nrecvs = nreceives;
186   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
187   aij->rmax       = nmax;
188 
189   return 0;
190 }
191 extern int MPIAIJSetUpMultiply(Mat);
192 
193 static int MatiAIJEndAssemble(Mat mat)
194 {
195   int        ierr;
196   Matimpiaij *aij = (Matimpiaij *) mat->data;
197 
198   MPI_Status  *send_status,recv_status;
199   int         index,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n;
200   int         row,col;
201   Scalar      *values,val;
202   InsertMode  addv = aij->insertmode;
203 
204   /*  wait on receives */
205   while (count) {
206     MPI_Waitany(nrecvs,aij->recv_waits,&index,&recv_status);
207     /* unpack receives into our local space */
208     values = aij->rvalues + 3*index*aij->rmax;
209     MPI_Get_count(&recv_status,MPI_SCALAR,&n);
210     n = n/3;
211     for ( i=0; i<n; i++ ) {
212       row = (int) PETSCREAL(values[3*i]) - aij->rstart;
213       col = (int) PETSCREAL(values[3*i+1]);
214       val = values[3*i+2];
215       if (col >= aij->cstart && col < aij->cend) {
216           col -= aij->cstart;
217         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
218       }
219       else {
220         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
221       }
222     }
223     count--;
224   }
225   FREE(aij->recv_waits); FREE(aij->rvalues);
226 
227   /* wait on sends */
228   if (aij->nsends) {
229     send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) );
230     CHKPTR(send_status);
231     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
232     FREE(send_status);
233   }
234   FREE(aij->send_waits); FREE(aij->svalues);
235 
236   aij->insertmode = NotSetValues;
237   ierr = MatBeginAssembly(aij->A); CHKERR(ierr);
238   ierr = MatEndAssembly(aij->A); CHKERR(ierr);
239 
240   ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr);
241   ierr = MatBeginAssembly(aij->B); CHKERR(ierr);
242   ierr = MatEndAssembly(aij->B); CHKERR(ierr);
243   return 0;
244 }
245 
246 static int MatiZero(Mat A)
247 {
248   Matimpiaij *l = (Matimpiaij *) A->data;
249 
250   MatZeroEntries(l->A); MatZeroEntries(l->B);
251   return 0;
252 }
253 
/* again this uses the same basic stratagy as in the assembly and
   scatter create routines, we should try to do it systemamatically
   if we can figure out the proper level of generality. */

/* the code does not do the diagonal entries correctly unless the
   matrix is square and the column and row owerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
/*
   MatiZerorows - Zeros the global rows listed in is (rows may be owned
   by any processor).  Each processor forwards the row numbers it does
   not own to the owning processor, collects the rows forwarded to it,
   and then zeros its local rows via MatZeroRows() on the A and B
   blocks; diag is applied only to the A (diagonal) block.
*/
static int MatiZerorows(Mat A,IS is,Scalar *diag)
{
  Matimpiaij     *l = (Matimpiaij *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
  int            *localrows,*procs,*nprocs,j,found,idx,nsends,*work;
  int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
  int            *rvalues,tag = 67,count,base,slen,n,len,*source;
  int            *lens,index,*lrows,*values;
  MPI_Comm       comm = l->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  ierr = ISGetLocalSize(is,&N); CHKERR(ierr);
  ierr = ISGetIndices(is,&rows); CHKERR(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
  MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* linear search of the ownership table for the owner of row idx */
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERR(1,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nrecvs = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  FREE(work);

  /* post receives: one max-size buffer per expected message   */
  rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */
  CHKPTR(rvalues);
  recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request));
  CHKPTR(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues);
  send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTR(send_waits);
  starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack the row numbers grouped by destination processor */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* packing destroyed starts[]; rebuild before issuing the sends */
  starts[0] = 0;
  for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
                comm,send_waits+count++);
    }
  }
  FREE(starts);

  base = owners[mytid];   /* first global row owned by this processor */

  /*  wait on receives; record each message's length and source */
  lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&index,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[index]  = recv_status.MPI_SOURCE;
    lens[index]  = n;
    slen += n;               /* slen = total number of local rows to zero */
    count--;
  }
  FREE(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;   /* convert to local numbering */
    }
  }
  FREE(rvalues); FREE(lens);
  FREE(owner); FREE(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr);  FREE(lrows);
  ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr);
  ierr = ISDestroy(istmp); CHKERR(ierr);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) );
    CHKPTR(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    FREE(send_status);
  }
  FREE(send_waits); FREE(svalues);


  return 0;
}
387 
388 static int MatiAIJMult(Mat aijin,Vec xx,Vec yy)
389 {
390   Matimpiaij *aij = (Matimpiaij *) aijin->data;
391   int        ierr;
392 
393   ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx);
394   CHKERR(ierr);
395   ierr = MatMult(aij->A,xx,yy); CHKERR(ierr);
396   ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); CHKERR(ierr);
397   ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr);
398   return 0;
399 }
400 
401 /*
402   This only works correctly for square matrices where the subblock A->A is the
403    diagonal block
404 */
405 static int MatiAIJgetdiag(Mat Ain,Vec v)
406 {
407   Matimpiaij *A = (Matimpiaij *) Ain->data;
408   return MatGetDiagonal(A->A,v);
409 }
410 
411 static int MatiAIJdestroy(PetscObject obj)
412 {
413   Mat        mat = (Mat) obj;
414   Matimpiaij *aij = (Matimpiaij *) mat->data;
415   int        ierr;
416   FREE(aij->rowners);
417   ierr = MatDestroy(aij->A); CHKERR(ierr);
418   ierr = MatDestroy(aij->B); CHKERR(ierr);
419   FREE(aij); FREE(mat);
420   if (aij->lvec) VecDestroy(aij->lvec);
421   if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
422   return 0;
423 }
424 
425 static int MatiView(PetscObject obj,Viewer viewer)
426 {
427   Mat        mat = (Mat) obj;
428   Matimpiaij *aij = (Matimpiaij *) mat->data;
429   int        ierr;
430 
431   MPE_Seq_begin(aij->comm,1);
432   printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
433           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
434           aij->cend);
435   ierr = MatView(aij->A,viewer); CHKERR(ierr);
436   ierr = MatView(aij->B,viewer); CHKERR(ierr);
437   MPE_Seq_end(aij->comm,1);
438   return 0;
439 }
440 
441 /*
442     This has to provide several versions.
443 
444      1) per sequential
445      2) a) use only local smoothing updating outer values only once.
446         b) local smoothing updating outer values each inner iteration
447      3) color updating out values betwen colors. (this imples an
448         ordering that is sort of related to the IS argument, it
449         is not clear a IS argument makes the most sense perhaps it
450         should be dropped.
451 */
452 static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,IS is,
453                         int its,Vec xx)
454 {
455   Matimpiaij *mat = (Matimpiaij *) matin->data;
456   Scalar     zero = 0.0;
457   int        ierr,one = 1, tmp, *idx, *diag;
458   int        n = mat->n, m = mat->m, i, j;
459 
460   if (is) SETERR(1,"No support for ordering in relaxation");
461   if (flag & SOR_ZERO_INITIAL_GUESS) {
462     if (ierr = VecSet(&zero,xx)) return ierr;
463   }
464 
465   /* update outer values from other processors*/
466 
467   /* smooth locally */
468   return 0;
469 }
/* -------------------------------------------------------------------*/
/* Function table for the parallel AIJ matrix type.  The initializer is
   positional and must match the slot order of struct _MatOps (declared
   elsewhere); a 0 means the operation is not supported by this type.
   NOTE(review): slot meanings inferred from the installed function
   names — confirm against the _MatOps declaration before reordering. */
static struct _MatOps MatOps = {MatiAIJInsertValues,
       0, 0,
       MatiAIJMult,0,0,0,
       0,0,0,0,
       0,0,
       MatiAIJrelax,
       0,
       0,0,0,
       0,
       MatiAIJgetdiag,0,0,
       MatiAIJBeginAssemble,MatiAIJEndAssemble,
       0,
       0,MatiZero,MatiZerorows,0,
       0,0,0,0 };
485 
486 
487 
488 /*@
489 
490       MatCreateMPIAIJ - Creates a sparse parallel matrix
491                                  in AIJ format.
492 
493   Input Parameters:
494 .   comm - MPI communicator
495 .   m,n - number of local rows and columns (or -1 to have calculated)
496 .   M,N - global rows and columns (or -1 to have calculated)
497 .   d_nz - total number nonzeros in diagonal portion of matrix
498 .   d_nzz - number of nonzeros per row in diagonal portion of matrix or null
499 .           You must leave room for the diagonal entry even if it is zero.
500 .   o_nz - total number nonzeros in off-diagonal portion of matrix
501 .   o_nzz - number of nonzeros per row in off-diagonal portion of matrix
502 .           or null. You must have at least one nonzero per row.
503 
504   Output parameters:
505 .  newmat - the matrix
506 
507   Keywords: matrix, aij, compressed row, sparse, parallel
508 @*/
509 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
510                  int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
511 {
512   Mat          mat;
513   Matimpiaij   *aij;
514   int          ierr, i,rl,len,sum[2],work[2];
515   *newmat         = 0;
516   CREATEHEADER(mat,_Mat);
517   mat->data       = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij);
518   mat->cookie     = MAT_COOKIE;
519   mat->ops        = &MatOps;
520   mat->destroy    = MatiAIJdestroy;
521   mat->view       = MatiView;
522   mat->type       = MATAIJMPI;
523   mat->factor     = 0;
524   mat->row        = 0;
525   mat->col        = 0;
526   aij->comm       = comm;
527   aij->insertmode = NotSetValues;
528   MPI_Comm_rank(comm,&aij->mytid);
529   MPI_Comm_size(comm,&aij->numtids);
530 
531   if (M == -1 || N == -1) {
532     work[0] = m; work[1] = n;
533     MPI_Allreduce((void *) work,(void *) sum,1,MPI_INT,MPI_SUM,comm );
534     if (M == -1) M = sum[0];
535     if (N == -1) N = sum[1];
536   }
537   if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
538   if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
539   aij->m       = m;
540   aij->n       = n;
541   aij->N       = N;
542   aij->M       = M;
543 
544   /* build local table of row and column ownerships */
545   aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int));
546   CHKPTR(aij->rowners);
547   aij->cowners = aij->rowners + aij->numtids + 1;
548   MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
549   aij->rowners[0] = 0;
550   for ( i=2; i<=aij->numtids; i++ ) {
551     aij->rowners[i] += aij->rowners[i-1];
552   }
553   aij->rstart = aij->rowners[aij->mytid];
554   aij->rend   = aij->rowners[aij->mytid+1];
555   MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
556   aij->cowners[0] = 0;
557   for ( i=2; i<=aij->numtids; i++ ) {
558     aij->cowners[i] += aij->cowners[i-1];
559   }
560   aij->cstart = aij->cowners[aij->mytid];
561   aij->cend   = aij->cowners[aij->mytid+1];
562 
563 
564   ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr);
565   ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr);
566 
567   /* build cache for off array entries formed */
568   aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */
569   aij->stash.n    = 0;
570   aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) +
571                             sizeof(Scalar))); CHKPTR(aij->stash.array);
572   aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax);
573   aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax);
574 
575   /* stuff used for matrix vector multiply */
576   aij->lvec      = 0;
577   aij->Mvctx     = 0;
578 
579   *newmat = mat;
580   return 0;
581 }
582