xref: /phasta/phSolver/compressible/solgmrpetsc.c (revision d06028c1ed09954bf598b0145fe1ac8aecabad11) !
1 #ifdef HAVE_PETSC
#include <stdio.h>
#include <stdlib.h>   /* malloc, free -- previously implicitly declared */
#include <stdint.h>   /* uint64_t used by the timer prototypes below */

#include "petscsys.h"
#include "petscvec.h"
#include "petscmat.h"
#include "petscpc.h"
#include "petscksp.h"

#include "assert.h"
#include "common_c.h"

#include <FCMangle.h>
14 #define SolGMRp FortranCInterface_GLOBAL_(solgmrp,SOLGMRP)
15 #define SolGMRpSclr FortranCInterface_GLOBAL_(solgmrpsclr,SOLGMRPSCLR)
16 #define ElmGMRPETSc FortranCInterface_GLOBAL_(elmgmrpetsc, ELMGMRPETSC)
17 #define ElmGMRPETScSclr FortranCInterface_GLOBAL_(elmgmrpetscsclr, ELMGMRPETSCSCLR)
18 #define rstatp FortranCInterface_GLOBAL_(rstatp, RSTATP)
19 #define rstatpSclr FortranCInterface_GLOBAL_(rstatpsclr, RSTATPSCLR)
20 #define min(a,b) (((a) < (b)) ? (a) : (b))
21 #define max(a,b) (((a) > (b)) ? (a) : (b))
22 #define get_time FortranCInterface_GLOBAL_(get_time,GET_TIME)
23 #define get_max_time_diff FortranCInterface_GLOBAL_(get_max_time_diff,GET_MAX_TIME_DIFF)
24 
25 typedef long long int gcorp_t;
26 
27 void get_time(uint64_t* rv, uint64_t* cycle);
28 void get_max_time_diff(uint64_t* first, uint64_t* last, uint64_t* c_first, uint64_t* c_last, char* lbl);
29 
30 //#include "auxmpi.h"
31 //      static PetscOffset poff;
32 
33       static Mat lhsP;
34       static PC pc;
35       static KSP ksp;
36       static Vec DyP, resP, DyPLocal;
37       static PetscErrorCode ierr;
38       static PetscInt PetscOne, PetscRow, PetscCol, LocalRow, LocalCol;
39       static IS LocalIndexSet;
40       static ISLocalToGlobalMapping VectorMapping;
41       static  VecScatter scatter7;
42       static int firstpetsccall = 1;
43       static  PetscInt maxitsHist;
44       static  PetscReal* resHist;
45 
46       static Mat lhsPs;
47       static PC pcs;
48       static KSP ksps;
49       static Vec DyPs, resPs, DyPLocals;
50       static IS LocalIndexSets;
51       static ISLocalToGlobalMapping VectorMappings;
52       static VecScatter scatter7s;
53       static int firstpetsccalls = 1;
54 
55       static int rankdump=-1;  // 8121 was the problem rank with 3.5.3
56       PetscReal resNrm;
57 
58 
SolGMRp(double * y,double * ac,double * yold,double * x,int * iBC,double * BC,int * col,int * row,double * lhsk,double * res,double * BDiag,int * iper,int * ilwork,double * shp,double * shgl,double * shpb,double * shglb,double * Dy,double * rerr,long long int * fncorp)59 void     SolGMRp(double* y,         double* ac,        double* yold,
60      	double* x,         int* iBC,       double* BC,
61      	int* col,       int* row,       double* lhsk,
62      	double* res,       double* BDiag,
63      	int* iper,
64      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
65      	double* shglb,     double* Dy,        double* rerr, long long int* fncorp)
66 {
67 //
68 // ----------------------------------------------------------------------
69 //
70 //  This is the preconditioned GMRES driver routine.
71 //
72 // input:
73 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
74 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
75 //  yold   (nshg,ndof)           : Y-variables at beginning of step
76 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
77 //  x      (numnp,nsd)            : node coordinates
78 //  iBC    (nshg)                : BC codes
79 //  BC     (nshg,ndofBC)         : BC constraint parameters
80 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
81 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
82 //
83 // output:
84 //  res    (nshg,nflow)           : preconditioned residual
85 //  BDiag  (nshg,nflow,nflow)      : block-diagonal preconditioner
86 //
87 //
88 // Zdenek Johan,  Winter 1991.  (Fortran 90)
89 // ----------------------------------------------------------------------
90 //
91 
92 
93 // Get variables from common_c.h
94       int nshg, nflow, nsd, iownnodes;
95       nshg  = conpar.nshg;
96       nflow = conpar.nflow;
97       nsd = NSD;
98       iownnodes = conpar.iownnodes;
99 
100       gcorp_t nshgt;
101       gcorp_t mbeg;
102       gcorp_t mend;
103       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
104       mbeg = (gcorp_t) newdim.minowned;
105       mend = (gcorp_t) newdim.maxowned;
106 
107 
108       int node, element, var, eqn;
109       double valtoinsert;
110       int nenl, iel, lelCat, lcsyst, iorder, nshl;
111       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
112       double DyFlat[nshg*nflow];
113       double DyFlatPhasta[nshg*nflow];
114       double rmes[nshg*nflow];
115 // DEBUG
116       int i,j,k,l,m;
117 
118       // FIXME: PetscScalar
119       double  real_rtol, real_abstol, real_dtol;
120 // /DEBUG
121       //double parray[1]; // Should be a PetscScalar
122       double *parray; // Should be a PetscScalar
123       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
124       PetscInt petsc_n;
125       PetscOne = 1;
126       uint64_t duration[4];
127 
128       gcorp_t glbNZ;
129 
130       if(firstpetsccall == 1) {
131 //Everthing in this conditional block should be moved to a function to estimate the size of PETSc's matrix which improves time on the first matrix fill
132 //
133         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
134         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
135         for(i=0;i<iownnodes;i++) {
136              idiagnz[i]=0;
137              iodiagnz[i]=0;
138         }
139         i=0;
140         for(k=0;k<nshg;k++) {
141           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
142 // this node is not owned by this rank so we skip
143           } else {
144            for(j=col[i]-1;j<col[i+1]-1;j++) {
145 //          assert(row[j]<=nshg);
146 //          assert(fncorp[row[j]-1]<=nshgt);
147              glbNZ=fncorp[row[j]-1];
148              if((glbNZ < mbeg) || (glbNZ > mend)) {
149                 iodiagnz[i]++;
150              } else {
151                 idiagnz[i]++;
152              }
153            }
154            i++;
155           }
156         }
157         gcorp_t mind=1000;
158         gcorp_t mino=1000;
159         gcorp_t maxd=0;
160         gcorp_t maxo=0;
161         for(i=0;i<iownnodes;i++) {
162            mind=min(mind,idiagnz[i]);
163            mino=min(mino,iodiagnz[i]);
164            maxd=max(maxd,idiagnz[i]);
165            maxo=max(maxo,iodiagnz[i]);
166 //           iodiagnz[i]=max(iodiagnz[i],10);
167 //           idiagnz[i]=max(idiagnz[i],10);
168 //           iodiagnz[i]=2*iodiagnz[i];   //  estimate a bit higher for off-part interactions
169 //           idiagnz[i]=2*idiagnz[i];   //  estimate a bit higher for off-part interactions
170         }
171 // the above was pretty good but below is faster and not too much more memory...of course once you do this
172 // could just use the constant fill parameters in create but keep it alive for potential later optimization
173 
174         for(i=0;i<iownnodes;i++) {
175            iodiagnz[i]=1.3*maxd;
176            idiagnz[i]=1.3*maxd;
177         }
178 
179 
180 
181         if(workfc.numpe < 200){
182           printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
183           printf("myrank,mind,maxd,mino,maxo %d %d %d %d %d \n",workfc.myrank,mind,maxd,mino,maxo);
184         }
185         // Print debug info
186         if(nshgt < 200){
187           int irank;
188           for(irank=0;irank<workfc.numpe;irank++) {
189             if(irank == workfc.myrank){
190               printf("mbeg,mend,myrank,idiagnz, iodiagnz %d %d %d \n",mbeg,mend,workfc.myrank);
191               for(i=0;i<iownnodes;i++) {
192                 printf("%d %ld %ld \n",workfc.myrank,idiagnz[i],iodiagnz[i]);
193               }
194             }
195            MPI_Barrier(MPI_COMM_WORLD);
196           }
197         }
198         petsc_bs = (PetscInt) nflow;
199         petsc_m  = (PetscInt) nflow* (PetscInt) iownnodes;
200         petsc_M  = (PetscInt) nshgt * (PetscInt) nflow;
201         petsc_PA  = (PetscInt) 40;
202         ierr = MatCreateBAIJ(PETSC_COMM_WORLD, petsc_bs, petsc_m, petsc_m, petsc_M, petsc_M,
203                              0, idiagnz, 0, iodiagnz, &lhsP);
204 
205         ierr = MatSetOption(lhsP, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
206 
207 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
208 #ifdef JEDBROWN
209         ierr = MatSetOption(lhsP, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
210 #endif
211         ierr = MatSetUp(lhsP);
212 
213       PetscInt myMatStart, myMatEnd;
214       ierr = MatGetOwnershipRange(lhsP, &myMatStart, &myMatEnd);
215 //debug
216       if(workfc.myrank == rankdump) printf("Flow myrank,myMatStart,myMatEnd %d,%d,%d, \n", workfc.myrank,myMatStart,myMatEnd);
217       }
218       if(genpar.lhs ==1) ierr = MatZeroEntries(lhsP);
219 
220 //
221 // .... *******************>> Element Data Formation <<******************
222 //
223 //
224 // .... set the parameters for flux and surface tension calculations
225 //
226 //
227       genpar.idflx = 0 ;
228       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
229       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
230 
231       get_time(duration, (duration+1));
232 
233       ElmGMRPETSc(y,             ac,            x,
234                   shp,           shgl,          iBC,
235                   BC,            shpb,
236                   shglb,         res,
237                   rmes,                   iper,
238                   ilwork,        rerr ,         &lhsP);
239 
240       get_time((duration+2), (duration+3));
241       get_max_time_diff((duration), (duration+2),
242                         (duration+1), (duration+3),
243                         "ElmGMRPETSc \0");  // char(0))
244        if(firstpetsccall == 1) {
245       // Setup IndexSet. For now, we mimic vector insertion procedure
246       // Since we always reference by global indexes this doesn't matter
247       // except for cache performance)
248       // TODO: Better arrangment?
249       PetscInt* indexsetary = malloc(sizeof(PetscInt)*nflow*nshg);
250       PetscInt nodetoinsert;
251         nodetoinsert = 0;
252         k=0;
253 //debug
254          if(workfc.myrank == rankdump) {
255              printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
256              printf("myrank,mbeg,mend %d %d %d \n",workfc.myrank,mbeg,mend);
257            }
258         if(workfc.numpe > 1) {
259           for (i=0; i<nshg ; i++) {
260             nodetoinsert = fncorp[i]-1;
261 //debug
262          if(workfc.myrank == rankdump) {
263              printf("myrank,i,nodetoinsert %d %d %d \n",workfc.myrank,i,nodetoinsert);
264          }
265 
266 //            assert(fncorp[i]>=0);
267             for (j=1; j<=nflow; j++) {
268               indexsetary[k] = nodetoinsert*nflow+(j-1);
269               assert(indexsetary[k]>=0);
270 //              assert(fncorp[i]>=0);
271               k = k+1;
272             }
273           }
274         }
275         else {
276           for (i=0; i<nshg ; i++) {
277             nodetoinsert = i;
278             for (j=1; j<=nflow; j++) {
279               indexsetary[k] = nodetoinsert*nflow+(j-1);
280               k = k+1;
281             }
282           }
283         }
284 
285 //  Create Vector Index Maps
286         petsc_n  = (PetscInt) nshg * (PetscInt) nflow;
287         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetary,
288      	       PETSC_COPY_VALUES, &LocalIndexSet);
289       free(indexsetary);
290       }
291       if(genpar.lhs == 1) {
292       get_time((duration), (duration+1));
293       ierr = MatAssemblyBegin(lhsP, MAT_FINAL_ASSEMBLY);
294       ierr = MatAssemblyEnd(lhsP, MAT_FINAL_ASSEMBLY);
295       get_time((duration+2),(duration+3));
296       get_max_time_diff((duration), (duration+2),
297                         (duration+1), (duration+3),
298                         "MatAssembly \0"); // char(0))
299       get_time(duration, (duration+1));
300       }
301       if(firstpetsccall == 1) {
302         ierr = MatGetLocalSize(lhsP, &LocalRow, &LocalCol);
303         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resP);
304         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyP);
305       }
306       ierr = VecZeroEntries(resP);
307       if(firstpetsccall == 1) {
308         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocal);
309       }
310 
311       PetscRow=0;
312       k = 0;
313       int index;
314       for (i=0; i<nshg; i++ ){
315         for (j = 1; j<=nflow; j++){
316           index = i + (j-1)*nshg;
317           valtoinsert = res[index];
318           if(workfc.numpe > 1) {
319             PetscRow = (fncorp[i]-1)*nflow+(j-1);
320           }
321           else {
322             PetscRow = i*nflow+(j-1);
323           }
324           assert(fncorp[i]<=nshgt);
325           assert(fncorp[i]>0);
326           assert(PetscRow>=0);
327           assert(PetscRow<=nshgt*nflow);
328           ierr =  VecSetValue(resP, PetscRow, valtoinsert, ADD_VALUES);
329         }
330       }
331       ierr = VecAssemblyBegin(resP);
332       ierr = VecAssemblyEnd(resP);
333       ierr = VecNorm(resP,NORM_2,&resNrm);
334       get_time((duration+2), (duration+3));
335       get_max_time_diff((duration), (duration+2),
336                         (duration+1), (duration+3),
337                         "VectorWorkPre \0"); // char(0))
338 
339       get_time((duration),(duration+1));
340 
341       if(firstpetsccall == 1) {
342         ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
343         ierr = KSPSetOperators(ksp, lhsP, lhsP);
344         ierr = KSPGetPC(ksp, &pc);
345         ierr = PCSetType(pc, PCPBJACOBI);
346         PetscInt maxits;
347         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
348         ierr = KSPSetTolerances(ksp, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
349         ierr = KSPSetFromOptions(ksp);
350         maxitsHist=1000;
351         resHist= (PetscReal*) malloc(sizeof(PetscReal)*maxitsHist);
352         ierr = KSPSetResidualHistory(ksp,resHist,maxitsHist, PETSC_TRUE);
353       }
354       ierr = KSPSolve(ksp, resP, DyP);
355       ierr = KSPGetResidualHistory(ksp,&resHist,&maxitsHist);
356       PetscReal resNrmP=resHist[0];
357       get_time((duration+2),(duration+3));
358       get_max_time_diff((duration), (duration+2),
359                         (duration+1), (duration+3),
360                         "KSPSolve \0"); // char(0))
361       get_time((duration),(duration+1));
362       if(firstpetsccall == 1) {
363       ierr = VecScatterCreate(DyP, LocalIndexSet, DyPLocal, NULL, &scatter7);
364       }
365       ierr = VecScatterBegin(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
366       ierr = VecScatterEnd(scatter7, DyP, DyPLocal, INSERT_VALUES, SCATTER_FORWARD);
367       ierr = VecGetArray(DyPLocal, &parray);
368       PetscRow = 0;
369       for ( node = 0; node< nshg; node++) {
370         for (var = 1; var<= nflow; var++) {
371           index = node + (var-1)*nshg;
372           Dy[index] = parray[PetscRow];
373           PetscRow = PetscRow+1;
374         }
375       }
376       ierr = VecRestoreArray(DyPLocal, &parray);
377 
378       firstpetsccall = 0;
379 
380 // .... output the statistics
381 //
382       itrpar.iKs=0; // see rstat()
383       PetscInt its;
384       ierr = KSPGetIterationNumber(ksp, &its);
385       itrpar.iKs = (int) its;
386       /*
387       PetscReal scale=1.0/sqrt(1.0*nshgt);
388       if(workfc.myrank ==0) {
389         printf("node resNrmP rosqrtNshgt\n");
390         for ( node = 0; node<its; node++) {
391           printf(" %d %f %f \n",node,resHist[node],scale*resHist[node]);
392         }
393       }
394 */
395       get_time((duration+2),(duration+3));
396       get_max_time_diff((duration), (duration+2),
397                         (duration+1), (duration+3),
398                         "solWork \0"); // char(0))
399       itrpar.ntotGM += (int) its;
400       rstatp (&resNrm,&resNrmP);
401 //
402 // .... end
403 //
404 }
405 
SolGMRpSclr(double * y,double * ac,double * x,double * elDw,int * iBC,double * BC,int * col,int * row,int * iper,int * ilwork,double * shp,double * shgl,double * shpb,double * shglb,double * res,double * Dy,long long int * fncorp)406 void     SolGMRpSclr(double* y,         double* ac,
407      	double* x,      double* elDw,   int* iBC,       double* BC,
408      	int* col,       int* row,
409      	int* iper,
410      	int* ilwork,    double* shp,       double* shgl,      double* shpb,
411      	double* shglb,  double* res,   double* Dy,     long long int* fncorp)
412 {
413 //
414 // ----------------------------------------------------------------------
415 //
416 //  This is the preconditioned GMRES driver routine.
417 //
418 // input:
419 //  y      (nshg,ndof)           : Y-variables at n+alpha_v
420 //  ac     (nshg,ndof)           : Primvar. accel. variable n+alpha_m
421 //  yold   (nshg,ndof)           : Y-variables at beginning of step
422 //  acold  (nshg,ndof)           : Primvar. accel. variable at begng step
423 //  x      (numnp,nsd)            : node coordinates
424 //  iBC    (nshg)                : BC codes
425 //  BC     (nshg,ndofBC)         : BC constraint parameters
426 //  shp(b) (nen,maxsh,melCat)     : element shape functions (boundary)
427 //  shgl(b)(nsd,nen,maxsh,melCat) : local gradients of shape functions
428 //
429 // ----------------------------------------------------------------------
430 //
431 
432 
433 // Get variables from common_c.h
434       int nshg, nflow, nsd, iownnodes;
435       nshg  = conpar.nshg;
436       nsd = NSD;
437       iownnodes = conpar.iownnodes;
438 
439       gcorp_t nshgt;
440       gcorp_t mbeg;
441       gcorp_t mend;
442       nshgt = (gcorp_t) newdim.nshgt; //Fix nshgt in common_c.h
443       mbeg = (gcorp_t) newdim.minowned;
444       mend = (gcorp_t) newdim.maxowned;
445 
446 
447       int node, element, var, eqn;
448       double valtoinsert;
449       int nenl, iel, lelCat, lcsyst, iorder, nshl;
450       int mattyp, ndofl, nsymdl, npro, ngauss, nppro;
451       double DyFlats[nshg];
452       double DyFlatPhastas[nshg];
453       double rmes[nshg];
454 // DEBUG
455       int i,j,k,l,m;
456 
457       double  real_rtol, real_abstol, real_dtol;
458       double *parray;
459       PetscInt petsc_bs, petsc_m, petsc_M,petsc_PA;
460       PetscInt petsc_n;
461       PetscOne = 1;
462       uint64_t duration[4];
463 
464 
465 //
466 //
467 // .... *******************>> Element Data Formation <<******************
468 //
469 //
470 // .... set the parameters for flux and surface tension calculations
471 //
472 //
473       genpar.idflx = 0 ;
474       if(genpar.idiff >= 1)  genpar.idflx= genpar.idflx + (nflow-1) * nsd;
475       if (genpar.isurf == 1) genpar.idflx=genpar.idflx + nsd;
476 
477 
478 
479       gcorp_t glbNZ;
480 
481       if(firstpetsccalls == 1) {
482         PetscInt* idiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
483         PetscInt* iodiagnz= (PetscInt*) malloc(sizeof(PetscInt)*iownnodes);
484         for(i=0;i<iownnodes;i++) {
485              idiagnz[i]=0;
486              iodiagnz[i]=0;
487         }
488         i=0;
489         for(k=0;k<nshg;k++) {
490           if((fncorp[k] < mbeg) || (fncorp[k] >mend)){
491 // this node is not owned by this rank so we skip
492           } else {
493            for(j=col[i]-1;j<col[i+1]-1;j++) {
494              glbNZ=fncorp[row[j]-1];
495              if((glbNZ < mbeg) || (glbNZ > mend)) {
496                 iodiagnz[i]++;
497              } else {
498                 idiagnz[i]++;
499              }
500            }
501            i++;
502           }
503         }
504         gcorp_t mind=1000;
505         gcorp_t mino=1000;
506         gcorp_t maxd=0;
507         gcorp_t maxo=0;
508         for(i=0;i<iownnodes;i++) {
509            mind=min(mind,idiagnz[i]);
510            mino=min(mino,iodiagnz[i]);
511            maxd=max(maxd,idiagnz[i]);
512            maxo=max(maxo,iodiagnz[i]);
513         }
514 
515         for(i=0;i<iownnodes;i++) {
516            iodiagnz[i]=1.3*maxd;
517            idiagnz[i]=1.3*maxd;
518         }
519 
520 
521 
522 //       }
523 //       if(firstpetsccalls == 1) {
524 
525         petsc_m  = (PetscInt) iownnodes;
526         petsc_M  = (PetscInt) nshgt;
527         petsc_PA  = (PetscInt) 40;
528 
529         ierr = MatCreateAIJ(PETSC_COMM_WORLD, petsc_m, petsc_m, petsc_M, petsc_M,
530                             0, idiagnz, 0, iodiagnz, &lhsPs);
531 
532         ierr = MatSetOption(lhsPs, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE);
533 
534 // Next is Jed Brown's improvement to imprint Assembly to make that stage scalable after the first call
535 #ifdef HIDEJEDBROWN
536         ierr = MatSetOption(lhsPs, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE);
537 #endif
538         ierr = MatSetUp(lhsPs);
539 
540       PetscInt myMatStart, myMatEnd;
541       ierr = MatGetOwnershipRange(lhsPs, &myMatStart, &myMatEnd);
542       if(workfc.myrank ==rankdump) printf("Sclr myrank,myMatStart,myMatEnd %d,%d,%d, \n", workfc.myrank,myMatStart,myMatEnd);
543       }
544 //      MPI_Barrier(MPI_COMM_WORLD);
545  //     if(workfc.myrank ==0) printf("Before MatZeroEntries  \n");
546       if(genpar.lhs == 1) ierr = MatZeroEntries(lhsPs);
547 
548       get_time(duration, (duration+1));
549 
550 //      MPI_Barrier(MPI_COMM_WORLD);
551  //     if(workfc.myrank ==0) printf("Before elmgmr  \n");
552 //      for (i=0; i<nshg ; i++) {
553 //            assert(fncorp[i]>0);
554 //      }
555 
556       ElmGMRPETScSclr(y,             ac,            x, elDw,
557                   shp,           shgl,          iBC,
558                   BC,            shpb,
559                   shglb,         res,
560                   rmes,                   iper,
561                   ilwork,        &lhsPs);
562        if(firstpetsccalls == 1) {
563       // Setup IndexSet. For now, we mimic vector insertion procedure
564       // Since we always reference by global indexes this doesn't matter
565       // except for cache performance)
566       // TODO: Better arrangment?
567 
568       PetscInt* indexsetarys = malloc(sizeof(PetscInt)*nshg);
569       PetscInt nodetoinsert;
570 
571         nodetoinsert = 0;
572         k=0;
573 
574 //debug
575          if(workfc.myrank == rankdump) {
576              printf("myrank,i,iownnodes,nshg %d %d %d %d \n",workfc.myrank,i,iownnodes,nshg);
577              printf("myrank,mbeg,mend %d %d %d \n",workfc.myrank,mbeg,mend);
578            }
579 
580         if(workfc.numpe > 1) {
581           for (i=0; i<nshg ; i++) {
582             nodetoinsert = fncorp[i]-1;
583 //debug
584          if(workfc.myrank == rankdump) {
585              printf("myrank,i,nodetoinsert %d %d %d \n",workfc.myrank,i,nodetoinsert);
586          }
587 
588             indexsetarys[k] = nodetoinsert;
589             k = k+1;
590           }
591         }
592         else {
593           for (i=0; i<nshg ; i++) {
594             nodetoinsert = i;
595             indexsetarys[k] = nodetoinsert;
596             k = k+1;
597           }
598         }
599 
600 //  Create Vector Index Maps
601         petsc_n  = (PetscInt) nshg;
602         ierr = ISCreateGeneral(PETSC_COMM_SELF, petsc_n, indexsetarys,
603      	       PETSC_COPY_VALUES, &LocalIndexSets);
604       free(indexsetarys);
605       }
606       if(genpar.lhs ==1) {
607       ierr = MatAssemblyBegin(lhsPs, MAT_FINAL_ASSEMBLY);
608       ierr = MatAssemblyEnd(lhsPs, MAT_FINAL_ASSEMBLY);
609       }
610       if(firstpetsccalls == 1) {
611         ierr = MatGetLocalSize(lhsPs, &LocalRow, &LocalCol);
612         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &resPs);
613         ierr = VecCreateMPI(PETSC_COMM_WORLD, LocalRow, petsc_M, &DyPs);
614       }
615       ierr = VecZeroEntries(resPs);
616       if(firstpetsccalls == 1) {
617         ierr = VecCreateSeq(PETSC_COMM_SELF, petsc_n, &DyPLocals);
618       }
619 
620       PetscRow=0;
621       k = 0;
622       int index;
623       for (i=0; i<nshg; i++ ){
624           valtoinsert = res[i];
625           if(workfc.numpe > 1) {
626             PetscRow = (fncorp[i]-1);
627           }
628           else {
629             PetscRow = i;
630           }
631           assert(fncorp[i]<=nshgt);
632           assert(fncorp[i]>0);
633           assert(PetscRow>=0);
634           assert(PetscRow<=nshgt);
635           ierr =  VecSetValue(resPs, PetscRow, valtoinsert, ADD_VALUES);
636       }
637       ierr = VecAssemblyBegin(resPs);
638       ierr = VecAssemblyEnd(resPs);
639       ierr = VecNorm(resPs,NORM_2,&resNrm);
640 
641       if(firstpetsccalls == 1) {
642         ierr = KSPCreate(PETSC_COMM_WORLD, &ksps);
643         ierr = KSPSetOperators(ksps, lhsPs, lhsPs);
644         ierr = KSPGetPC(ksps, &pcs);
645         ierr = PCSetType(pcs, PCPBJACOBI);
646         PetscInt maxits;
647         maxits = (PetscInt)  solpar.nGMRES * (PetscInt) solpar.Kspace;
648         ierr = KSPSetTolerances(ksps, timdat.etol, PETSC_DEFAULT, PETSC_DEFAULT, maxits);
649         ierr = KSPSetFromOptions(ksps);
650       }
651       ierr = KSPSolve(ksps, resPs, DyPs);
652       if(firstpetsccalls == 1) {
653       ierr = VecScatterCreate(DyPs, LocalIndexSets, DyPLocals, NULL, &scatter7s);
654       }
655       ierr = VecScatterBegin(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
656       ierr = VecScatterEnd(scatter7s, DyPs, DyPLocals, INSERT_VALUES, SCATTER_FORWARD);
657       ierr = VecGetArray(DyPLocals, &parray);
658       PetscRow = 0;
659       for ( node = 0; node< nshg; node++) {
660           index = node;
661           Dy[index] = parray[PetscRow];
662           PetscRow = PetscRow+1;
663       }
664       ierr = VecRestoreArray(DyPLocals, &parray);
665 
666       firstpetsccalls = 0;
667 
668 // .... output the statistics
669 //
670 //      itrpar.iKss=0; // see rstat()
671       PetscInt its;
672       ierr = KSPGetIterationNumber(ksps, &its);
673       itrpar.iKss = (int) its;
674       itrpar.ntotGMs += (int) its;
675       rstatpSclr (&resNrm);
676 //
677 // .... end
678 //
679 }
680 #endif
681