1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 #include <petsc/private/hashmapiv.h> 13 #include <petsc/private/hashseti.h> 14 #include <petscsf.h> 15 16 17 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer) 18 { 19 PetscErrorCode ierr; 20 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 21 Mat_APMPI *ptap=a->ap; 22 PetscBool iascii; 23 PetscViewerFormat format; 24 25 PetscFunctionBegin; 26 if (!ptap) { 27 /* hack: MatDuplicate() sets oldmat->ops->view to newmat which is a base mat class with null ptpa! */ 28 A->ops->view = MatView_MPIAIJ; 29 ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr); 30 PetscFunctionReturn(0); 31 } 32 33 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr); 34 if (iascii) { 35 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 36 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 37 if (ptap->algType == 0) { 38 ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr); 39 } else if (ptap->algType == 1) { 40 ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr); 41 } else if (ptap->algType == 2) { 42 ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr); 43 } else if (ptap->algType == 3) { 44 ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr); 45 } 46 } 47 } 48 ierr = (ptap->view)(A,viewer);CHKERRQ(ierr); 49 PetscFunctionReturn(0); 50 } 51 52 PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A) 53 { 54 PetscErrorCode ierr; 55 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 56 Mat_APMPI *ptap=a->ap; 57 Mat_Merge_SeqsToMPI *merge; 58 59 PetscFunctionBegin; 60 if (!ptap) PetscFunctionReturn(0); 61 62 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 63 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 64 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 65 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 66 ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */ 67 ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr); 68 ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr); 69 if (ptap->AP_loc) { /* used by alg_rap */ 70 Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data; 71 ierr = PetscFree(ap->i);CHKERRQ(ierr); 72 ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr); 73 ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr); 74 } else { /* used by alg_ptap */ 75 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 76 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 77 } 78 ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr); 79 ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr); 80 if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);} 81 82 ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr); 83 84 merge=ptap->merge; 85 if (merge) { /* used by alg_ptap */ 86 ierr = PetscFree(merge->id_r);CHKERRQ(ierr); 87 ierr = PetscFree(merge->len_s);CHKERRQ(ierr); 88 ierr = PetscFree(merge->len_r);CHKERRQ(ierr); 89 ierr = PetscFree(merge->bi);CHKERRQ(ierr); 90 ierr = PetscFree(merge->bj);CHKERRQ(ierr); 91 ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr); 92 ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr); 93 ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr); 94 ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr); 95 ierr = PetscFree(merge->coi);CHKERRQ(ierr); 96 ierr = PetscFree(merge->coj);CHKERRQ(ierr); 97 ierr = PetscFree(merge->owners_co);CHKERRQ(ierr); 98 ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr); 99 ierr = PetscFree(ptap->merge);CHKERRQ(ierr); 100 } 101 ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr); 102 103 ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr); 104 ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr); 105 ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 110 { 111 PetscErrorCode ierr; 112 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 113 Mat_APMPI *ptap=a->ap; 114 115 PetscFunctionBegin; 116 ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr); 117 ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */ 118 ierr = PetscFree(ptap);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C) 123 { 124 PetscErrorCode ierr; 125 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 126 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 127 Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq; 128 Mat_APMPI *ptap = c->ap; 129 Mat AP_loc,C_loc,C_oth; 130 PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout; 131 PetscScalar *apa; 132 const PetscInt *cols; 133 const PetscScalar *vals; 134 135 PetscFunctionBegin; 136 if (!ptap->AP_loc) { 137 MPI_Comm comm; 138 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 139 SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 140 } 141 142 ierr = MatZeroEntries(C);CHKERRQ(ierr); 143 144 /* 1) get R = Pd^T,Ro = Po^T */ 145 if (ptap->reuse == MAT_REUSE_MATRIX) { 146 ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 147 ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 148 } 149 150 /* 2) get AP_loc */ 151 AP_loc = ptap->AP_loc; 152 ap = (Mat_SeqAIJ*)AP_loc->data; 153 154 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 155 /*-----------------------------------------------------*/ 156 if (ptap->reuse == MAT_REUSE_MATRIX) { 157 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 158 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 159 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 160 } 161 162 /* 2-2) compute numeric A_loc*P - dominating part */ 163 /* ---------------------------------------------- */ 164 /* get data from symbolic products */ 165 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 166 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 167 168 api = ap->i; 169 apj = ap->j; 170 ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr); 171 for (i=0; i<am; i++) { 172 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 173 apnz = api[i+1] - api[i]; 174 apa = ap->a + api[i]; 175 ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr); 176 AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa); 177 } 178 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr); 179 if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout); 180 181 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 182 /* Always use scalable version since we are in the MPI scalable version */ 183 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 184 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 185 186 C_loc = ptap->C_loc; 187 C_oth = ptap->C_oth; 188 189 /* add C_loc and Co to to C */ 190 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 191 192 /* C_loc -> C */ 193 cm = C_loc->rmap->N; 194 c_seq = (Mat_SeqAIJ*)C_loc->data; 195 cols = c_seq->j; 196 vals = c_seq->a; 197 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 198 199 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 200 /* when there are no off-processor parts. */ 201 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 202 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 203 /* a table, and other, more complex stuff has to be done. */ 204 if (C->assembled) { 205 C->was_assembled = PETSC_TRUE; 206 C->assembled = PETSC_FALSE; 207 } 208 if (C->was_assembled) { 209 for (i=0; i<cm; i++) { 210 ncols = c_seq->i[i+1] - c_seq->i[i]; 211 row = rstart + i; 212 ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 213 cols += ncols; vals += ncols; 214 } 215 } else { 216 ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr); 217 } 218 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 219 if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 220 221 /* Co -> C, off-processor part */ 222 cm = C_oth->rmap->N; 223 c_seq = (Mat_SeqAIJ*)C_oth->data; 224 cols = c_seq->j; 225 vals = c_seq->a; 226 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 227 for (i=0; i<cm; i++) { 228 ncols = c_seq->i[i+1] - c_seq->i[i]; 229 row = p->garray[i]; 230 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 231 cols += ncols; vals += ncols; 232 } 233 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 234 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 235 236 ptap->reuse = MAT_REUSE_MATRIX; 237 238 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 239 if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 240 241 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 242 if (ptap->freestruct) { 243 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi) 249 { 250 PetscErrorCode ierr; 251 Mat_APMPI *ptap; 252 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 253 MPI_Comm comm; 254 PetscMPIInt size,rank; 255 Mat P_loc,P_oth; 256 PetscFreeSpaceList free_space=NULL,current_space=NULL; 257 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 258 PetscInt *lnk,i,k,pnz,row,nsend; 259 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 260 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 261 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 262 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 263 MPI_Request *swaits,*rwaits; 264 MPI_Status *sstatus,rstatus; 265 PetscLayout rowmap; 266 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 267 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 268 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout; 269 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 270 PetscScalar *apv; 271 PetscTable ta; 272 MatType mtype; 273 const char *prefix; 274 #if defined(PETSC_USE_INFO) 275 PetscReal apfill; 276 #endif 277 278 PetscFunctionBegin; 279 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 280 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 281 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 282 283 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 284 285 /* create symbolic parallel matrix Cmpi */ 286 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 287 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 288 289 /* create struct Mat_APMPI and attached it to C later */ 290 ierr = PetscNew(&ptap);CHKERRQ(ierr); 291 ptap->reuse = MAT_INITIAL_MATRIX; 292 ptap->algType = 0; 293 294 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 295 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr); 296 /* get P_loc by taking all local rows of P */ 297 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr); 298 299 ptap->P_loc = P_loc; 300 ptap->P_oth = P_oth; 301 302 /* (0) compute Rd = Pd^T, Ro = Po^T */ 303 /* --------------------------------- */ 304 ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 305 ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 306 307 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 308 /* ----------------------------------------------------------------- */ 309 p_loc = (Mat_SeqAIJ*)P_loc->data; 310 if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data; 311 312 /* create and initialize a linked list */ 313 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 314 MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta); 315 MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta); 316 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 317 318 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 319 320 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 321 if (ao) { 322 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 323 } else { 324 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 325 } 326 current_space = free_space; 327 nspacedouble = 0; 328 329 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 330 api[0] = 0; 331 for (i=0; i<am; i++) { 332 /* diagonal portion: Ad[i,:]*P */ 333 ai = ad->i; pi = p_loc->i; 334 nzi = ai[i+1] - ai[i]; 335 aj = ad->j + ai[i]; 336 for (j=0; j<nzi; j++) { 337 row = aj[j]; 338 pnz = pi[row+1] - pi[row]; 339 Jptr = p_loc->j + pi[row]; 340 /* add non-zero cols of P into the sorted linked list lnk */ 341 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 342 } 343 /* off-diagonal portion: Ao[i,:]*P */ 344 if (ao) { 345 ai = ao->i; pi = p_oth->i; 346 nzi = ai[i+1] - ai[i]; 347 aj = ao->j + ai[i]; 348 for (j=0; j<nzi; j++) { 349 row = aj[j]; 350 pnz = pi[row+1] - pi[row]; 351 Jptr = p_oth->j + pi[row]; 352 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 353 } 354 } 355 apnz = lnk[0]; 356 api[i+1] = api[i] + apnz; 357 358 /* if free space is not available, double the total space in the list */ 359 if (current_space->local_remaining<apnz) { 360 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 361 nspacedouble++; 362 } 363 364 /* Copy data into free space, then initialize lnk */ 365 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 366 367 current_space->array += apnz; 368 current_space->local_used += apnz; 369 current_space->local_remaining -= apnz; 370 } 371 /* Allocate space for apj and apv, initialize apj, and */ 372 /* destroy list of free space and other temporary array(s) */ 373 ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 374 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 375 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 376 377 /* Create AP_loc for reuse */ 378 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 379 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr); 380 381 #if defined(PETSC_USE_INFO) 382 if (ao) { 383 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 384 } else { 385 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 386 } 387 ptap->AP_loc->info.mallocs = nspacedouble; 388 ptap->AP_loc->info.fill_ratio_given = fill; 389 ptap->AP_loc->info.fill_ratio_needed = apfill; 390 391 if (api[am]) { 392 ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 393 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 394 } else { 395 ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 396 } 397 #endif 398 399 /* (2-1) compute symbolic Co = Ro*AP_loc */ 400 /* -------------------------------------- */ 401 ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr); 402 ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr); 403 ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr); 404 ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");CHKERRQ(ierr); 405 406 ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr); 407 ierr = MatProductSetAlgorithm(ptap->C_oth,"sorted");CHKERRQ(ierr); 408 ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr); 409 ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr); 410 ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr); 411 412 /* (3) send coj of C_oth to other processors */ 413 /* ------------------------------------------ */ 414 /* determine row ownership */ 415 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 416 rowmap->n = pn; 417 rowmap->bs = 1; 418 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 419 owners = rowmap->range; 420 421 /* determine the number of messages to send, their lengths */ 422 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 423 ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr); 424 ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr); 425 426 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 427 coi = c_oth->i; coj = c_oth->j; 428 con = ptap->C_oth->rmap->n; 429 proc = 0; 430 ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr); 431 for (i=0; i<con; i++) { 432 while (prmap[i] >= owners[proc+1]) proc++; 433 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 434 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 435 } 436 437 len = 0; /* max length of buf_si[], see (4) */ 438 owners_co[0] = 0; 439 nsend = 0; 440 for (proc=0; proc<size; proc++) { 441 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 442 if (len_s[proc]) { 443 nsend++; 444 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 445 len += len_si[proc]; 446 } 447 } 448 449 /* determine the number and length of messages to receive for coi and coj */ 450 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 451 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 452 453 /* post the Irecv and Isend of coj */ 454 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 455 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 456 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 457 for (proc=0, k=0; proc<size; proc++) { 458 if (!len_s[proc]) continue; 459 i = owners_co[proc]; 460 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 461 k++; 462 } 463 464 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 465 /* ---------------------------------------- */ 466 ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr); 467 ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr); 468 ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr); 469 ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr); 470 471 ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr); 472 ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");CHKERRQ(ierr); 473 474 ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr); 475 ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr); 476 477 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 478 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr); 479 480 /* receives coj are complete */ 481 for (i=0; i<nrecv; i++) { 482 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 483 } 484 ierr = PetscFree(rwaits);CHKERRQ(ierr); 485 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 486 487 /* add received column indices into ta to update Crmax */ 488 for (k=0; k<nrecv; k++) {/* k-th received message */ 489 Jptr = buf_rj[k]; 490 for (j=0; j<len_r[k]; j++) { 491 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 492 } 493 } 494 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 495 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 496 497 /* (4) send and recv coi */ 498 /*-----------------------*/ 499 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 500 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 501 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 502 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 503 for (proc=0,k=0; proc<size; proc++) { 504 if (!len_s[proc]) continue; 505 /* form outgoing message for i-structure: 506 buf_si[0]: nrows to be sent 507 [1:nrows]: row index (global) 508 [nrows+1:2*nrows+1]: i-structure index 509 */ 510 /*-------------------------------------------*/ 511 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 512 buf_si_i = buf_si + nrows+1; 513 buf_si[0] = nrows; 514 buf_si_i[0] = 0; 515 nrows = 0; 516 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 517 nzi = coi[i+1] - coi[i]; 518 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 519 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 520 nrows++; 521 } 522 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 523 k++; 524 buf_si += len_si[proc]; 525 } 526 for (i=0; i<nrecv; i++) { 527 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 528 } 529 ierr = PetscFree(rwaits);CHKERRQ(ierr); 530 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 531 532 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 533 ierr = PetscFree(len_ri);CHKERRQ(ierr); 534 ierr = PetscFree(swaits);CHKERRQ(ierr); 535 ierr = PetscFree(buf_s);CHKERRQ(ierr); 536 537 /* (5) compute the local portion of Cmpi */ 538 /* ------------------------------------------ */ 539 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 540 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 541 current_space = free_space; 542 543 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 544 for (k=0; k<nrecv; k++) { 545 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 546 nrows = *buf_ri_k[k]; 547 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 548 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 549 } 550 551 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 552 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 553 for (i=0; i<pn; i++) { 554 /* add C_loc into Cmpi */ 555 nzi = c_loc->i[i+1] - c_loc->i[i]; 556 Jptr = c_loc->j + c_loc->i[i]; 557 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 558 559 /* add received col data into lnk */ 560 for (k=0; k<nrecv; k++) { /* k-th received message */ 561 if (i == *nextrow[k]) { /* i-th row */ 562 nzi = *(nextci[k]+1) - *nextci[k]; 563 Jptr = buf_rj[k] + *nextci[k]; 564 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 565 nextrow[k]++; nextci[k]++; 566 } 567 } 568 nzi = lnk[0]; 569 570 /* copy data into free space, then initialize lnk */ 571 ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr); 572 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 573 } 574 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 575 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 576 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 577 578 /* local sizes and preallocation */ 579 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 580 if (P->cmap->bs > 0) { 581 ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr); 582 ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr); 583 } 584 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 585 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 586 587 /* members in merge */ 588 ierr = PetscFree(id_r);CHKERRQ(ierr); 589 ierr = PetscFree(len_r);CHKERRQ(ierr); 590 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 591 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 592 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 593 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 594 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 595 596 /* attach the supporting struct to Cmpi for reuse */ 597 c = (Mat_MPIAIJ*)Cmpi->data; 598 c->ap = ptap; 599 ptap->duplicate = Cmpi->ops->duplicate; 600 ptap->destroy = Cmpi->ops->destroy; 601 ptap->view = Cmpi->ops->view; 602 603 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 604 Cmpi->assembled = PETSC_FALSE; 605 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 606 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 607 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 608 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 609 610 nout = 0; 611 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr); 612 if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout); 613 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr); 614 if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout); 615 616 PetscFunctionReturn(0); 617 } 618 619 PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht) 620 { 621 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 622 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 623 PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k; 624 PetscInt pcstart,pcend,column,offset; 625 PetscErrorCode ierr; 626 627 PetscFunctionBegin; 628 pcstart = P->cmap->rstart; 629 pcstart *= dof; 630 pcend = P->cmap->rend; 631 pcend *= dof; 632 /* diagonal portion: Ad[i,:]*P */ 633 ai = ad->i; 634 nzi = ai[i+1] - ai[i]; 635 aj = ad->j + ai[i]; 636 for (j=0; j<nzi; j++) { 637 row = aj[j]; 638 offset = row%dof; 639 row /= dof; 640 nzpi = pd->i[row+1] - pd->i[row]; 641 pj = pd->j + pd->i[row]; 642 for (k=0; k<nzpi; k++) { 643 ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr); 644 } 645 } 646 /* off diag P */ 647 for (j=0; j<nzi; j++) { 648 row = aj[j]; 649 offset = row%dof; 650 row /= dof; 651 nzpi = po->i[row+1] - po->i[row]; 652 pj = po->j + po->i[row]; 653 for (k=0; k<nzpi; k++) { 654 ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr); 655 } 656 } 657 658 /* off diagonal part: Ao[i, :]*P_oth */ 659 if (ao) { 660 ai = ao->i; 661 pi = p_oth->i; 662 nzi = ai[i+1] - ai[i]; 663 aj = ao->j + ai[i]; 664 for (j=0; j<nzi; j++) { 665 row = aj[j]; 666 offset = a->garray[row]%dof; 667 row = map[row]; 668 pnz = pi[row+1] - pi[row]; 669 p_othcols = p_oth->j + pi[row]; 670 for (col=0; col<pnz; col++) { 671 column = p_othcols[col] * dof + offset; 672 if (column>=pcstart && column<pcend) { 673 ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr); 674 } else { 675 ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr); 676 } 677 } 678 } 679 } /* end if (ao) */ 680 PetscFunctionReturn(0); 681 } 682 683 PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap) 684 { 685 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 686 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 687 PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset; 688 PetscScalar ra,*aa,*pa; 689 PetscErrorCode ierr; 690 691 PetscFunctionBegin; 692 pcstart = P->cmap->rstart; 693 pcstart *= dof; 694 695 /* diagonal portion: Ad[i,:]*P */ 696 ai = ad->i; 697 nzi = ai[i+1] - ai[i]; 698 aj = ad->j + ai[i]; 699 aa = ad->a + ai[i]; 700 for (j=0; j<nzi; j++) { 701 ra = aa[j]; 702 row = aj[j]; 703 offset = row%dof; 704 row /= dof; 705 nzpi = pd->i[row+1] - pd->i[row]; 706 pj = pd->j + pd->i[row]; 707 pa = pd->a + pd->i[row]; 708 for (k=0; k<nzpi; k++) { 709 ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr); 710 } 711 ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr); 712 } 713 for (j=0; j<nzi; j++) { 714 ra = aa[j]; 715 row = aj[j]; 716 offset = row%dof; 717 row /= dof; 718 nzpi = po->i[row+1] - po->i[row]; 719 pj = po->j + po->i[row]; 720 pa = po->a + po->i[row]; 721 for (k=0; k<nzpi; k++) { 722 ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr); 723 } 724 ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr); 725 } 726 727 /* off diagonal part: Ao[i, :]*P_oth */ 728 if (ao) { 729 ai = ao->i; 730 pi = p_oth->i; 731 nzi = ai[i+1] - ai[i]; 732 aj = ao->j + ai[i]; 733 aa = ao->a + ai[i]; 734 for (j=0; j<nzi; j++) { 735 row = aj[j]; 736 offset = a->garray[row]%dof; 737 row = map[row]; 738 ra = aa[j]; 739 pnz = pi[row+1] - pi[row]; 740 p_othcols = p_oth->j + pi[row]; 741 pa = p_oth->a + pi[row]; 742 for (col=0; col<pnz; col++) { 743 ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr); 744 } 745 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 746 } 747 } /* end if (ao) */ 748 749 PetscFunctionReturn(0); 750 } 751 752 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*); 753 754 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C) 755 { 756 PetscErrorCode ierr; 757 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 758 Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data; 759 Mat_APMPI *ptap = c->ap; 760 PetscHMapIV hmap; 761 PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc; 762 PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa; 763 PetscInt offset,ii,pocol; 764 const PetscInt *mappingindices; 765 IS map; 766 MPI_Comm comm; 767 768 PetscFunctionBegin; 769 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 770 if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 771 772 ierr = MatZeroEntries(C);CHKERRQ(ierr); 773 774 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 775 /*-----------------------------------------------------*/ 776 if (ptap->reuse == MAT_REUSE_MATRIX) { 777 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 778 ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr); 779 } 780 ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr); 781 782 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 783 pon *= dof; 784 ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr); 785 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 786 cmaxr = 0; 787 for (i=0; i<pon; i++) { 788 cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]); 789 } 790 ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr); 791 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 792 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 793 ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr); 794 for (i=0; i<am && pon; i++) { 795 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 796 offset = i%dof; 797 ii = i/dof; 798 nzi = po->i[ii+1] - po->i[ii]; 799 if (!nzi) continue; 800 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr); 801 voff = 0; 802 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 803 if (!voff) continue; 804 805 /* Form C(ii, :) */ 806 poj = po->j + po->i[ii]; 807 poa = po->a + po->i[ii]; 808 for (j=0; j<nzi; j++) { 809 pocol = poj[j]*dof+offset; 810 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 811 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 812 for (jj=0; jj<voff; jj++) { 813 apvaluestmp[jj] = apvalues[jj]*poa[j]; 814 /*If the row is empty */ 815 if (!c_rmtc[pocol]) { 816 c_rmtjj[jj] = apindices[jj]; 817 c_rmtaa[jj] = apvaluestmp[jj]; 818 c_rmtc[pocol]++; 819 } else { 820 ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr); 821 if (loc>=0){ /* hit */ 822 c_rmtaa[loc] += apvaluestmp[jj]; 823 ierr = PetscLogFlops(1.0);CHKERRQ(ierr); 824 } else { /* new element */ 825 loc = -(loc+1); 826 /* Move data backward */ 827 for (kk=c_rmtc[pocol]; kk>loc; kk--) { 828 c_rmtjj[kk] = c_rmtjj[kk-1]; 829 c_rmtaa[kk] = c_rmtaa[kk-1]; 830 }/* End kk */ 831 c_rmtjj[loc] = apindices[jj]; 832 c_rmtaa[loc] = apvaluestmp[jj]; 833 c_rmtc[pocol]++; 834 } 835 } 836 ierr = PetscLogFlops(voff);CHKERRQ(ierr); 837 } /* End jj */ 838 } /* End j */ 839 } /* End i */ 840 841 ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr); 842 ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); 843 844 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 845 pn *= dof; 846 ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr); 847 848 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 849 ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 850 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 851 pcstart = pcstart*dof; 852 pcend = pcend*dof; 853 cd = (Mat_SeqAIJ*)(c->A)->data; 854 co = (Mat_SeqAIJ*)(c->B)->data; 855 856 cmaxr = 0; 857 for (i=0; i<pn; i++) { 858 cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i])); 859 } 860 ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr); 861 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 862 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 863 for (i=0; i<am && pn; i++) { 864 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 865 offset = i%dof; 866 ii = i/dof; 867 nzi = pd->i[ii+1] - pd->i[ii]; 868 if (!nzi) continue; 869 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr); 870 voff = 0; 871 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 872 if (!voff) continue; 873 /* Form C(ii, :) */ 874 pdj = pd->j + pd->i[ii]; 875 pda = pd->a + pd->i[ii]; 876 for (j=0; j<nzi; j++) { 877 row = pcstart + pdj[j] * dof + offset; 878 for (jj=0; jj<voff; jj++) { 879 apvaluestmp[jj] = apvalues[jj]*pda[j]; 880 } 881 ierr = PetscLogFlops(voff);CHKERRQ(ierr); 882 ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr); 883 } 884 } 885 ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr); 886 ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr); 887 ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr); 888 ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); 889 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 890 ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 891 ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr); 892 893 /* Add contributions from remote */ 894 for (i = 0; i < pn; i++) { 895 row = i + pcstart; 896 ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr); 897 } 898 ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr); 899 900 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 901 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 902 903 ptap->reuse = MAT_REUSE_MATRIX; 904 905 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 906 if (ptap->freestruct) { 907 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 908 } 909 PetscFunctionReturn(0); 910 } 911 912 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C) 913 { 914 PetscErrorCode ierr; 915 916 PetscFunctionBegin; 917 918 ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C) 923 { 924 PetscErrorCode ierr; 925 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 926 Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data; 927 Mat_APMPI *ptap = c->ap; 928 PetscHMapIV hmap; 929 PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc; 930 PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa; 931 PetscInt offset,ii,pocol; 932 const PetscInt *mappingindices; 933 IS map; 934 MPI_Comm comm; 935 936 PetscFunctionBegin; 937 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 938 if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 939 940 ierr = MatZeroEntries(C);CHKERRQ(ierr); 941 942 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 943 /*-----------------------------------------------------*/ 944 if (ptap->reuse == MAT_REUSE_MATRIX) { 945 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 946 ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr); 947 } 948 ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr); 949 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 950 pon *= dof; 951 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 952 pn *= dof; 953 954 ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr); 955 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 956 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 957 pcstart *= dof; 958 pcend *= dof; 959 cmaxr = 0; 960 for (i=0; i<pon; i++) { 961 cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]); 962 } 963 cd = (Mat_SeqAIJ*)(c->A)->data; 964 co = (Mat_SeqAIJ*)(c->B)->data; 965 for (i=0; i<pn; i++) { 966 cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i])); 967 } 968 ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr); 969 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 970 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 971 ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr); 972 for (i=0; i<am && (pon || pn); i++) { 973 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 974 offset = i%dof; 975 ii = i/dof; 976 nzi = po->i[ii+1] - po->i[ii]; 977 dnzi = pd->i[ii+1] - pd->i[ii]; 978 if (!nzi && !dnzi) continue; 979 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr); 980 voff = 0; 981 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 982 if (!voff) continue; 983 984 /* Form remote C(ii, :) */ 985 poj = po->j + po->i[ii]; 986 poa = po->a + po->i[ii]; 987 for (j=0; j<nzi; j++) { 988 pocol = poj[j]*dof+offset; 989 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 990 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 991 for (jj=0; jj<voff; jj++) { 992 apvaluestmp[jj] = apvalues[jj]*poa[j]; 993 /*If the row is empty */ 994 if (!c_rmtc[pocol]) { 995 c_rmtjj[jj] = apindices[jj]; 996 c_rmtaa[jj] = apvaluestmp[jj]; 997 c_rmtc[pocol]++; 998 } else { 999 ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr); 1000 if (loc>=0){ /* hit */ 1001 c_rmtaa[loc] += apvaluestmp[jj]; 1002 ierr = PetscLogFlops(1.0);CHKERRQ(ierr); 1003 } else { /* new element */ 1004 loc = -(loc+1); 1005 /* Move data backward */ 1006 for (kk=c_rmtc[pocol]; kk>loc; kk--) { 1007 c_rmtjj[kk] = c_rmtjj[kk-1]; 1008 c_rmtaa[kk] = c_rmtaa[kk-1]; 1009 }/* End kk */ 1010 c_rmtjj[loc] = apindices[jj]; 1011 c_rmtaa[loc] = apvaluestmp[jj]; 1012 c_rmtc[pocol]++; 1013 } 1014 } 1015 } /* End jj */ 1016 ierr = PetscLogFlops(voff);CHKERRQ(ierr); 1017 } /* End j */ 1018 1019 /* Form local C(ii, :) */ 1020 pdj = pd->j + pd->i[ii]; 1021 pda = pd->a + pd->i[ii]; 1022 for (j=0; j<dnzi; j++) { 1023 row = pcstart + pdj[j] * dof + offset; 1024 for (jj=0; jj<voff; jj++) { 1025 apvaluestmp[jj] = apvalues[jj]*pda[j]; 1026 }/* End kk */ 1027 ierr = PetscLogFlops(voff);CHKERRQ(ierr); 1028 ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr); 1029 }/* End j */ 1030 } /* End i */ 1031 1032 ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr); 1033 ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr); 1034 ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); 1035 ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr); 1036 1037 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1038 ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 1039 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1040 ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 1041 ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr); 1042 1043 /* Add contributions from remote */ 1044 for (i = 0; i < pn; i++) { 1045 row = i + pcstart; 1046 ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr); 1047 } 1048 ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr); 1049 1050 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1051 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1052 1053 ptap->reuse = MAT_REUSE_MATRIX; 1054 1055 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 1056 if (ptap->freestruct) { 1057 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 1058 } 1059 PetscFunctionReturn(0); 1060 } 1061 1062 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C) 1063 { 1064 PetscErrorCode ierr; 1065 1066 PetscFunctionBegin; 1067 1068 ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr); 1069 PetscFunctionReturn(0); 1070 } 1071 1072 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi) 1073 { 1074 Mat_APMPI *ptap; 1075 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1076 MPI_Comm comm; 1077 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 1078 MatType mtype; 1079 PetscSF sf; 1080 PetscSFNode *iremote; 1081 PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves; 1082 const PetscInt *rootdegrees; 1083 PetscHSetI ht,oht,*hta,*hto; 1084 PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets; 1085 PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj; 1086 PetscInt nalg=2,alg=0,offset,ii; 1087 PetscMPIInt owner; 1088 const PetscInt *mappingindices; 1089 PetscBool flg; 1090 const char *algTypes[2] = {"overlapping","merged"}; 1091 IS map; 1092 PetscErrorCode ierr; 1093 1094 PetscFunctionBegin; 1095 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1096 1097 /* Create symbolic parallel matrix Cmpi */ 1098 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 1099 pn *= dof; 1100 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1101 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1102 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1103 1104 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1105 ptap->reuse = MAT_INITIAL_MATRIX; 1106 ptap->algType = 2; 1107 1108 /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1109 ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr); 1110 ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr); 1111 /* This equals to the number of offdiag columns in P */ 1112 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 1113 pon *= dof; 1114 /* offsets */ 1115 ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr); 1116 /* The number of columns we will send to remote ranks */ 1117 ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr); 1118 ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr); 1119 for (i=0; i<pon; i++) { 1120 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1121 } 1122 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 1123 ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr); 1124 /* Create hash table to merge all columns for C(i, :) */ 1125 ierr = PetscHSetICreate(&ht);CHKERRQ(ierr); 1126 1127 ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr); 1128 ptap->c_rmti[0] = 0; 1129 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1130 for (i=0; i<am && pon; i++) { 1131 /* Form one row of AP */ 1132 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1133 offset = i%dof; 1134 ii = i/dof; 1135 /* If the off diag is empty, we should not do any calculation */ 1136 nzi = po->i[ii+1] - po->i[ii]; 1137 if (!nzi) continue; 1138 1139 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr); 1140 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1141 /* If AP is empty, just continue */ 1142 if (!htsize) continue; 1143 /* Form C(ii, :) */ 1144 poj = po->j + po->i[ii]; 1145 for (j=0; j<nzi; j++) { 1146 ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr); 1147 } 1148 } 1149 1150 for (i=0; i<pon; i++) { 1151 ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr); 1152 ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize; 1153 c_rmtc[i] = htsize; 1154 } 1155 1156 ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr); 1157 1158 for (i=0; i<pon; i++) { 1159 off = 0; 1160 ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr); 1161 ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr); 1162 } 1163 ierr = PetscFree(hta);CHKERRQ(ierr); 1164 1165 ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr); 1166 for (i=0; i<pon; i++) { 1167 owner = 0; lidx = 0; 1168 offset = i%dof; 1169 ii = i/dof; 1170 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr); 1171 iremote[i].index = lidx*dof + offset; 1172 iremote[i].rank = owner; 1173 } 1174 1175 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 1176 ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1177 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1178 ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr); 1179 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 1180 ierr = PetscSFSetUp(sf);CHKERRQ(ierr); 1181 /* How many neighbors have contributions to my rows? */ 1182 ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr); 1183 ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr); 1184 rootspacesize = 0; 1185 for (i = 0; i < pn; i++) { 1186 rootspacesize += rootdegrees[i]; 1187 } 1188 ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr); 1189 ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr); 1190 /* Get information from leaves 1191 * Number of columns other people contribute to my rows 1192 * */ 1193 ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1194 ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1195 ierr = PetscFree(c_rmtc);CHKERRQ(ierr); 1196 ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr); 1197 /* The number of columns is received for each row */ 1198 ptap->c_othi[0] = 0; 1199 rootspacesize = 0; 1200 rootspaceoffsets[0] = 0; 1201 for (i = 0; i < pn; i++) { 1202 rcvncols = 0; 1203 for (j = 0; j<rootdegrees[i]; j++) { 1204 rcvncols += rootspace[rootspacesize]; 1205 rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1206 rootspacesize++; 1207 } 1208 ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols; 1209 } 1210 ierr = PetscFree(rootspace);CHKERRQ(ierr); 1211 1212 ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr); 1213 ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1214 ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1215 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 1216 ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr); 1217 1218 ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr); 1219 nleaves = 0; 1220 for (i = 0; i<pon; i++) { 1221 owner = 0; 1222 ii = i/dof; 1223 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr); 1224 sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i]; 1225 for (j=0; j<sendncols; j++) { 1226 iremote[nleaves].rank = owner; 1227 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1228 } 1229 } 1230 ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr); 1231 ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr); 1232 1233 ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr); 1234 ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1235 ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr); 1236 /* One to one map */ 1237 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1238 1239 ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr); 1240 ierr = PetscHSetICreate(&oht);CHKERRQ(ierr); 1241 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 1242 pcstart *= dof; 1243 pcend *= dof; 1244 ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr); 1245 for (i=0; i<pn; i++) { 1246 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1247 ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr); 1248 } 1249 /* Work on local part */ 1250 /* 4) Pass 1: Estimate memory for C_loc */ 1251 for (i=0; i<am && pn; i++) { 1252 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1253 ierr = PetscHSetIClear(oht);CHKERRQ(ierr); 1254 offset = i%dof; 1255 ii = i/dof; 1256 nzi = pd->i[ii+1] - pd->i[ii]; 1257 if (!nzi) continue; 1258 1259 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr); 1260 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1261 ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr); 1262 if (!(htsize+htosize)) continue; 1263 /* Form C(ii, :) */ 1264 pdj = pd->j + pd->i[ii]; 1265 for (j=0; j<nzi; j++) { 1266 ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr); 1267 ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr); 1268 } 1269 } 1270 1271 ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr); 1272 1273 ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr); 1274 ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr); 1275 1276 /* Get remote data */ 1277 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1278 ierr = PetscFree(c_rmtj);CHKERRQ(ierr); 1279 1280 for (i = 0; i < pn; i++) { 1281 nzi = ptap->c_othi[i+1] - ptap->c_othi[i]; 1282 rdj = c_othj + ptap->c_othi[i]; 1283 for (j = 0; j < nzi; j++) { 1284 col = rdj[j]; 1285 /* diag part */ 1286 if (col>=pcstart && col<pcend) { 1287 ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr); 1288 } else { /* off diag */ 1289 ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr); 1290 } 1291 } 1292 ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr); 1293 dnz[i] = htsize; 1294 ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr); 1295 ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr); 1296 onz[i] = htsize; 1297 ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr); 1298 } 1299 1300 ierr = PetscFree2(hta,hto);CHKERRQ(ierr); 1301 ierr = PetscFree(c_othj);CHKERRQ(ierr); 1302 1303 /* local sizes and preallocation */ 1304 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1305 ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr); 1306 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1307 ierr = MatSetUp(Cmpi);CHKERRQ(ierr); 1308 ierr = PetscFree2(dnz,onz);CHKERRQ(ierr); 1309 1310 /* attach the supporting struct to Cmpi for reuse */ 1311 c = (Mat_MPIAIJ*)Cmpi->data; 1312 c->ap = ptap; 1313 ptap->duplicate = Cmpi->ops->duplicate; 1314 ptap->destroy = Cmpi->ops->destroy; 1315 ptap->view = Cmpi->ops->view; 1316 1317 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1318 Cmpi->assembled = PETSC_FALSE; 1319 /* pick an algorithm */ 1320 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1321 alg = 0; 1322 ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 1323 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1324 switch (alg) { 1325 case 0: 1326 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1327 break; 1328 case 1: 1329 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1330 break; 1331 default: 1332 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n"); 1333 } 1334 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1335 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 1336 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 1337 PetscFunctionReturn(0); 1338 } 1339 1340 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C) 1341 { 1342 PetscErrorCode ierr; 1343 1344 PetscFunctionBegin; 1345 1346 ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr); 1347 PetscFunctionReturn(0); 1348 } 1349 1350 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi) 1351 { 1352 Mat_APMPI *ptap; 1353 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1354 MPI_Comm comm; 1355 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 1356 MatType mtype; 1357 PetscSF sf; 1358 PetscSFNode *iremote; 1359 PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves; 1360 const PetscInt *rootdegrees; 1361 PetscHSetI ht,oht,*hta,*hto,*htd; 1362 PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets; 1363 PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj; 1364 PetscInt nalg=2,alg=0,offset,ii; 1365 PetscMPIInt owner; 1366 PetscBool flg; 1367 const char *algTypes[2] = {"merged","overlapping"}; 1368 const PetscInt *mappingindices; 1369 IS map; 1370 PetscErrorCode ierr; 1371 1372 PetscFunctionBegin; 1373 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1374 1375 /* Create symbolic parallel matrix Cmpi */ 1376 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 1377 pn *= dof; 1378 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1379 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1380 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1381 1382 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1383 ptap->reuse = MAT_INITIAL_MATRIX; 1384 ptap->algType = 3; 1385 1386 /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1387 ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr); 1388 ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr); 1389 1390 /* This equals to the number of offdiag columns in P */ 1391 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 1392 pon *= dof; 1393 /* offsets */ 1394 ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr); 1395 /* The number of columns we will send to remote ranks */ 1396 ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr); 1397 ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr); 1398 for (i=0; i<pon; i++) { 1399 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1400 } 1401 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 1402 ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr); 1403 /* Create hash table to merge all columns for C(i, :) */ 1404 ierr = PetscHSetICreate(&ht);CHKERRQ(ierr); 1405 ierr = PetscHSetICreate(&oht);CHKERRQ(ierr); 1406 ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr); 1407 for (i=0; i<pn; i++) { 1408 ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr); 1409 ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr); 1410 } 1411 1412 ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr); 1413 ptap->c_rmti[0] = 0; 1414 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1415 for (i=0; i<am && (pon || pn); i++) { 1416 /* Form one row of AP */ 1417 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1418 ierr = PetscHSetIClear(oht);CHKERRQ(ierr); 1419 offset = i%dof; 1420 ii = i/dof; 1421 /* If the off diag is empty, we should not do any calculation */ 1422 nzi = po->i[ii+1] - po->i[ii]; 1423 dnzi = pd->i[ii+1] - pd->i[ii]; 1424 if (!nzi && !dnzi) continue; 1425 1426 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr); 1427 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1428 ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr); 1429 /* If AP is empty, just continue */ 1430 if (!(htsize+htosize)) continue; 1431 1432 /* Form remote C(ii, :) */ 1433 poj = po->j + po->i[ii]; 1434 for (j=0; j<nzi; j++) { 1435 ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr); 1436 ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr); 1437 } 1438 1439 /* Form local C(ii, :) */ 1440 pdj = pd->j + pd->i[ii]; 1441 for (j=0; j<dnzi; j++) { 1442 ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr); 1443 ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr); 1444 } 1445 } 1446 1447 ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr); 1448 1449 ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr); 1450 ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr); 1451 1452 for (i=0; i<pon; i++) { 1453 ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr); 1454 ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize; 1455 c_rmtc[i] = htsize; 1456 } 1457 1458 ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr); 1459 1460 for (i=0; i<pon; i++) { 1461 off = 0; 1462 ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr); 1463 ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr); 1464 } 1465 ierr = PetscFree(hta);CHKERRQ(ierr); 1466 1467 ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr); 1468 for (i=0; i<pon; i++) { 1469 owner = 0; lidx = 0; 1470 offset = i%dof; 1471 ii = i/dof; 1472 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr); 1473 iremote[i].index = lidx*dof+offset; 1474 iremote[i].rank = owner; 1475 } 1476 1477 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 1478 ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1479 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1480 ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr); 1481 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 1482 ierr = PetscSFSetUp(sf);CHKERRQ(ierr); 1483 /* How many neighbors have contributions to my rows? */ 1484 ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr); 1485 ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr); 1486 rootspacesize = 0; 1487 for (i = 0; i < pn; i++) { 1488 rootspacesize += rootdegrees[i]; 1489 } 1490 ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr); 1491 ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr); 1492 /* Get information from leaves 1493 * Number of columns other people contribute to my rows 1494 * */ 1495 ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1496 ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1497 ierr = PetscFree(c_rmtc);CHKERRQ(ierr); 1498 ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr); 1499 /* The number of columns is received for each row */ 1500 ptap->c_othi[0] = 0; 1501 rootspacesize = 0; 1502 rootspaceoffsets[0] = 0; 1503 for (i = 0; i < pn; i++) { 1504 rcvncols = 0; 1505 for (j = 0; j<rootdegrees[i]; j++) { 1506 rcvncols += rootspace[rootspacesize]; 1507 rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1508 rootspacesize++; 1509 } 1510 ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols; 1511 } 1512 ierr = PetscFree(rootspace);CHKERRQ(ierr); 1513 1514 ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr); 1515 ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1516 ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1517 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 1518 ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr); 1519 1520 ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr); 1521 nleaves = 0; 1522 for (i = 0; i<pon; i++) { 1523 owner = 0; 1524 ii = i/dof; 1525 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr); 1526 sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i]; 1527 for (j=0; j<sendncols; j++) { 1528 iremote[nleaves].rank = owner; 1529 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1530 } 1531 } 1532 ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr); 1533 ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr); 1534 1535 ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr); 1536 ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1537 ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr); 1538 /* One to one map */ 1539 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1540 /* Get remote data */ 1541 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1542 ierr = PetscFree(c_rmtj);CHKERRQ(ierr); 1543 ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr); 1544 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 1545 pcstart *= dof; 1546 pcend *= dof; 1547 for (i = 0; i < pn; i++) { 1548 nzi = ptap->c_othi[i+1] - ptap->c_othi[i]; 1549 rdj = c_othj + ptap->c_othi[i]; 1550 for (j = 0; j < nzi; j++) { 1551 col = rdj[j]; 1552 /* diag part */ 1553 if (col>=pcstart && col<pcend) { 1554 ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr); 1555 } else { /* off diag */ 1556 ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr); 1557 } 1558 } 1559 ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr); 1560 dnz[i] = htsize; 1561 ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr); 1562 ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr); 1563 onz[i] = htsize; 1564 ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr); 1565 } 1566 1567 ierr = PetscFree2(htd,hto);CHKERRQ(ierr); 1568 ierr = PetscFree(c_othj);CHKERRQ(ierr); 1569 1570 /* local sizes and preallocation */ 1571 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1572 ierr = MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr); 1573 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1574 ierr = PetscFree2(dnz,onz);CHKERRQ(ierr); 1575 1576 /* attach the supporting struct to Cmpi for reuse */ 1577 c = (Mat_MPIAIJ*)Cmpi->data; 1578 c->ap = ptap; 1579 ptap->duplicate = Cmpi->ops->duplicate; 1580 ptap->destroy = Cmpi->ops->destroy; 1581 ptap->view = Cmpi->ops->view; 1582 1583 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1584 Cmpi->assembled = PETSC_FALSE; 1585 /* pick an algorithm */ 1586 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1587 alg = 0; 1588 ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 1589 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1590 switch (alg) { 1591 case 0: 1592 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1593 break; 1594 case 1: 1595 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1596 break; 1597 default: 1598 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n"); 1599 } 1600 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1601 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 1602 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 1603 PetscFunctionReturn(0); 1604 } 1605 1606 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C) 1607 { 1608 PetscErrorCode ierr; 1609 1610 PetscFunctionBegin; 1611 1612 ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr); 1613 PetscFunctionReturn(0); 1614 } 1615 1616 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi) 1617 { 1618 PetscErrorCode ierr; 1619 Mat_APMPI *ptap; 1620 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 1621 MPI_Comm comm; 1622 PetscMPIInt size,rank; 1623 PetscFreeSpaceList free_space=NULL,current_space=NULL; 1624 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 1625 PetscInt *lnk,i,k,pnz,row,nsend; 1626 PetscBT lnkbt; 1627 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 1628 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1629 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 1630 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1631 MPI_Request *swaits,*rwaits; 1632 MPI_Status *sstatus,rstatus; 1633 PetscLayout rowmap; 1634 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1635 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 1636 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi; 1637 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 1638 PetscScalar *apv; 1639 PetscTable ta; 1640 MatType mtype; 1641 const char *prefix; 1642 #if defined(PETSC_USE_INFO) 1643 PetscReal apfill; 1644 #endif 1645 1646 PetscFunctionBegin; 1647 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1648 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1649 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1650 1651 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 1652 1653 /* create symbolic parallel matrix Cmpi */ 1654 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1655 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1656 1657 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1658 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1659 1660 /* create struct Mat_APMPI and attached it to C later */ 1661 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1662 ptap->reuse = MAT_INITIAL_MATRIX; 1663 ptap->algType = 1; 1664 1665 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1666 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1667 /* get P_loc by taking all local rows of P */ 1668 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1669 1670 /* (0) compute Rd = Pd^T, Ro = Po^T */ 1671 /* --------------------------------- */ 1672 ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 1673 ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 1674 1675 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1676 /* ----------------------------------------------------------------- */ 1677 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1678 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1679 1680 /* create and initialize a linked list */ 1681 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 1682 MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta); 1683 MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta); 1684 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 1685 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1686 1687 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 1688 1689 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1690 if (ao) { 1691 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 1692 } else { 1693 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 1694 } 1695 current_space = free_space; 1696 nspacedouble = 0; 1697 1698 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 1699 api[0] = 0; 1700 for (i=0; i<am; i++) { 1701 /* diagonal portion: Ad[i,:]*P */ 1702 ai = ad->i; pi = p_loc->i; 1703 nzi = ai[i+1] - ai[i]; 1704 aj = ad->j + ai[i]; 1705 for (j=0; j<nzi; j++) { 1706 row = aj[j]; 1707 pnz = pi[row+1] - pi[row]; 1708 Jptr = p_loc->j + pi[row]; 1709 /* add non-zero cols of P into the sorted linked list lnk */ 1710 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1711 } 1712 /* off-diagonal portion: Ao[i,:]*P */ 1713 if (ao) { 1714 ai = ao->i; pi = p_oth->i; 1715 nzi = ai[i+1] - ai[i]; 1716 aj = ao->j + ai[i]; 1717 for (j=0; j<nzi; j++) { 1718 row = aj[j]; 1719 pnz = pi[row+1] - pi[row]; 1720 Jptr = p_oth->j + pi[row]; 1721 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1722 } 1723 } 1724 apnz = lnk[0]; 1725 api[i+1] = api[i] + apnz; 1726 if (ap_rmax < apnz) ap_rmax = apnz; 1727 1728 /* if free space is not available, double the total space in the list */ 1729 if (current_space->local_remaining<apnz) { 1730 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 1731 nspacedouble++; 1732 } 1733 1734 /* Copy data into free space, then initialize lnk */ 1735 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1736 1737 current_space->array += apnz; 1738 current_space->local_used += apnz; 1739 current_space->local_remaining -= apnz; 1740 } 1741 /* Allocate space for apj and apv, initialize apj, and */ 1742 /* destroy list of free space and other temporary array(s) */ 1743 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 1744 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 1745 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1746 1747 /* Create AP_loc for reuse */ 1748 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 1749 1750 #if defined(PETSC_USE_INFO) 1751 if (ao) { 1752 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 1753 } else { 1754 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 1755 } 1756 ptap->AP_loc->info.mallocs = nspacedouble; 1757 ptap->AP_loc->info.fill_ratio_given = fill; 1758 ptap->AP_loc->info.fill_ratio_needed = apfill; 1759 1760 if (api[am]) { 1761 ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 1762 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 1763 } else { 1764 ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 1765 } 1766 #endif 1767 1768 /* (2-1) compute symbolic Co = Ro*AP_loc */ 1769 /* ------------------------------------ */ 1770 ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr); 1771 ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr); 1772 ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr); 1773 1774 ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr); 1775 ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr); 1776 ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr); 1777 ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");CHKERRQ(ierr); 1778 1779 ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr); 1780 ierr = MatProductSetAlgorithm(ptap->C_oth,"default");CHKERRQ(ierr); 1781 ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr); 1782 ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr); 1783 ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr); 1784 1785 /* (3) send coj of C_oth to other processors */ 1786 /* ------------------------------------------ */ 1787 /* determine row ownership */ 1788 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 1789 rowmap->n = pn; 1790 rowmap->bs = 1; 1791 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 1792 owners = rowmap->range; 1793 1794 /* determine the number of messages to send, their lengths */ 1795 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 1796 ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr); 1797 ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr); 1798 1799 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 1800 coi = c_oth->i; coj = c_oth->j; 1801 con = ptap->C_oth->rmap->n; 1802 proc = 0; 1803 for (i=0; i<con; i++) { 1804 while (prmap[i] >= owners[proc+1]) proc++; 1805 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1806 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 1807 } 1808 1809 len = 0; /* max length of buf_si[], see (4) */ 1810 owners_co[0] = 0; 1811 nsend = 0; 1812 for (proc=0; proc<size; proc++) { 1813 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1814 if (len_s[proc]) { 1815 nsend++; 1816 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1817 len += len_si[proc]; 1818 } 1819 } 1820 1821 /* determine the number and length of messages to receive for coi and coj */ 1822 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 1823 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 1824 1825 /* post the Irecv and Isend of coj */ 1826 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1827 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1828 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 1829 for (proc=0, k=0; proc<size; proc++) { 1830 if (!len_s[proc]) continue; 1831 i = owners_co[proc]; 1832 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1833 k++; 1834 } 1835 1836 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1837 /* ---------------------------------------- */ 1838 ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr); 1839 ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr); 1840 1841 ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr); 1842 ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr); 1843 ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr); 1844 ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");CHKERRQ(ierr); 1845 1846 ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr); 1847 ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr); 1848 ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr); 1849 ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr); 1850 ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr); 1851 1852 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 1853 1854 /* receives coj are complete */ 1855 for (i=0; i<nrecv; i++) { 1856 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1857 } 1858 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1859 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 1860 1861 /* add received column indices into ta to update Crmax */ 1862 for (k=0; k<nrecv; k++) {/* k-th received message */ 1863 Jptr = buf_rj[k]; 1864 for (j=0; j<len_r[k]; j++) { 1865 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 1866 } 1867 } 1868 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 1869 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 1870 1871 /* (4) send and recv coi */ 1872 /*-----------------------*/ 1873 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1874 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1875 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 1876 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1877 for (proc=0,k=0; proc<size; proc++) { 1878 if (!len_s[proc]) continue; 1879 /* form outgoing message for i-structure: 1880 buf_si[0]: nrows to be sent 1881 [1:nrows]: row index (global) 1882 [nrows+1:2*nrows+1]: i-structure index 1883 */ 1884 /*-------------------------------------------*/ 1885 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 1886 buf_si_i = buf_si + nrows+1; 1887 buf_si[0] = nrows; 1888 buf_si_i[0] = 0; 1889 nrows = 0; 1890 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1891 nzi = coi[i+1] - coi[i]; 1892 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1893 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1894 nrows++; 1895 } 1896 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1897 k++; 1898 buf_si += len_si[proc]; 1899 } 1900 for (i=0; i<nrecv; i++) { 1901 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1902 } 1903 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1904 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 1905 1906 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 1907 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1908 ierr = PetscFree(swaits);CHKERRQ(ierr); 1909 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1910 1911 /* (5) compute the local portion of Cmpi */ 1912 /* ------------------------------------------ */ 1913 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1914 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 1915 current_space = free_space; 1916 1917 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 1918 for (k=0; k<nrecv; k++) { 1919 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1920 nrows = *buf_ri_k[k]; 1921 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1922 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1923 } 1924 1925 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 1926 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 1927 for (i=0; i<pn; i++) { 1928 /* add C_loc into Cmpi */ 1929 nzi = c_loc->i[i+1] - c_loc->i[i]; 1930 Jptr = c_loc->j + c_loc->i[i]; 1931 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1932 1933 /* add received col data into lnk */ 1934 for (k=0; k<nrecv; k++) { /* k-th received message */ 1935 if (i == *nextrow[k]) { /* i-th row */ 1936 nzi = *(nextci[k]+1) - *nextci[k]; 1937 Jptr = buf_rj[k] + *nextci[k]; 1938 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1939 nextrow[k]++; nextci[k]++; 1940 } 1941 } 1942 nzi = lnk[0]; 1943 1944 /* copy data into free space, then initialize lnk */ 1945 ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1946 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 1947 } 1948 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1949 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1950 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 1951 1952 /* local sizes and preallocation */ 1953 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1954 if (P->cmap->bs > 0) { 1955 ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr); 1956 ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr); 1957 } 1958 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1959 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1960 1961 /* members in merge */ 1962 ierr = PetscFree(id_r);CHKERRQ(ierr); 1963 ierr = PetscFree(len_r);CHKERRQ(ierr); 1964 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 1965 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 1966 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 1967 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 1968 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 1969 1970 /* attach the supporting struct to Cmpi for reuse */ 1971 c = (Mat_MPIAIJ*)Cmpi->data; 1972 c->ap = ptap; 1973 ptap->duplicate = Cmpi->ops->duplicate; 1974 ptap->destroy = Cmpi->ops->destroy; 1975 ptap->view = Cmpi->ops->view; 1976 ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr); 1977 1978 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1979 Cmpi->assembled = PETSC_FALSE; 1980 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1981 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 1982 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 1983 PetscFunctionReturn(0); 1984 } 1985 1986 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 1987 { 1988 PetscErrorCode ierr; 1989 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 1990 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 1991 Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq; 1992 Mat_APMPI *ptap = c->ap; 1993 Mat AP_loc,C_loc,C_oth; 1994 PetscInt i,rstart,rend,cm,ncols,row; 1995 PetscInt *api,*apj,am = A->rmap->n,j,col,apnz; 1996 PetscScalar *apa; 1997 const PetscInt *cols; 1998 const PetscScalar *vals; 1999 2000 PetscFunctionBegin; 2001 if (!ptap->AP_loc) { 2002 MPI_Comm comm; 2003 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2004 SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 2005 } 2006 2007 ierr = MatZeroEntries(C);CHKERRQ(ierr); 2008 /* 1) get R = Pd^T,Ro = Po^T */ 2009 if (ptap->reuse == MAT_REUSE_MATRIX) { 2010 ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 2011 ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 2012 } 2013 2014 /* 2) get AP_loc */ 2015 AP_loc = ptap->AP_loc; 2016 ap = (Mat_SeqAIJ*)AP_loc->data; 2017 2018 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 2019 /*-----------------------------------------------------*/ 2020 if (ptap->reuse == MAT_REUSE_MATRIX) { 2021 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 2022 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 2023 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 2024 } 2025 2026 /* 2-2) compute numeric A_loc*P - dominating part */ 2027 /* ---------------------------------------------- */ 2028 /* get data from symbolic products */ 2029 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 2030 if (ptap->P_oth) { 2031 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 2032 } 2033 apa = ptap->apa; 2034 api = ap->i; 2035 apj = ap->j; 2036 for (i=0; i<am; i++) { 2037 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 2038 AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa); 2039 apnz = api[i+1] - api[i]; 2040 for (j=0; j<apnz; j++) { 2041 col = apj[j+api[i]]; 2042 ap->a[j+ap->i[i]] = apa[col]; 2043 apa[col] = 0.0; 2044 } 2045 } 2046 2047 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 2048 ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 2049 ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 2050 C_loc = ptap->C_loc; 2051 C_oth = ptap->C_oth; 2052 2053 /* add C_loc and Co to to C */ 2054 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 2055 2056 /* C_loc -> C */ 2057 cm = C_loc->rmap->N; 2058 c_seq = (Mat_SeqAIJ*)C_loc->data; 2059 cols = c_seq->j; 2060 vals = c_seq->a; 2061 2062 2063 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 2064 /* when there are no off-processor parts. */ 2065 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 2066 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 2067 /* a table, and other, more complex stuff has to be done. */ 2068 if (C->assembled) { 2069 C->was_assembled = PETSC_TRUE; 2070 C->assembled = PETSC_FALSE; 2071 } 2072 if (C->was_assembled) { 2073 for (i=0; i<cm; i++) { 2074 ncols = c_seq->i[i+1] - c_seq->i[i]; 2075 row = rstart + i; 2076 ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 2077 cols += ncols; vals += ncols; 2078 } 2079 } else { 2080 ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr); 2081 } 2082 2083 /* Co -> C, off-processor part */ 2084 cm = C_oth->rmap->N; 2085 c_seq = (Mat_SeqAIJ*)C_oth->data; 2086 cols = c_seq->j; 2087 vals = c_seq->a; 2088 for (i=0; i<cm; i++) { 2089 ncols = c_seq->i[i+1] - c_seq->i[i]; 2090 row = p->garray[i]; 2091 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 2092 cols += ncols; vals += ncols; 2093 } 2094 2095 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2096 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2097 2098 ptap->reuse = MAT_REUSE_MATRIX; 2099 2100 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 2101 if (ptap->freestruct) { 2102 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 2103 } 2104 PetscFunctionReturn(0); 2105 } 2106 2107 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) 2108 { 2109 PetscErrorCode ierr; 2110 Mat_Product *product = C->product; 2111 Mat A=product->A,P=product->B; 2112 MatProductAlgorithm alg=product->alg; 2113 PetscReal fill=product->fill; 2114 PetscBool flg; 2115 2116 PetscFunctionBegin; 2117 /* scalable: do R=P^T locally, then C=R*A*P */ 2118 ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 2119 if (flg) { 2120 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);CHKERRQ(ierr); 2121 C->ops->productnumeric = MatProductNumeric_PtAP; 2122 goto next; 2123 } 2124 2125 /* nonscalable: do R=P^T locally, then C=R*A*P */ 2126 ierr = PetscStrcmp(alg,"nonscalable",&flg);CHKERRQ(ierr); 2127 if (flg) { 2128 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); 2129 goto next; 2130 } 2131 2132 /* allatonce */ 2133 ierr = PetscStrcmp(alg,"allatonce",&flg);CHKERRQ(ierr); 2134 if (flg) { 2135 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr); 2136 goto next; 2137 } 2138 2139 /* allatonce_merged */ 2140 ierr = PetscStrcmp(alg,"allatonce_merged",&flg);CHKERRQ(ierr); 2141 if (flg) { 2142 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr); 2143 goto next; 2144 } 2145 2146 /* hypre */ 2147 #if defined(PETSC_HAVE_HYPRE) 2148 ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 2149 if (flg) { 2150 ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr); 2151 C->ops->productnumeric = MatProductNumeric_PtAP; 2152 PetscFunctionReturn(0); 2153 } 2154 #endif 2155 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 2156 2157 next: 2158 C->ops->productnumeric = MatProductNumeric_PtAP; 2159 { 2160 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2161 Mat_APMPI *ap = c->ap; 2162 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr); 2163 ap->freestruct = PETSC_FALSE; 2164 ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);CHKERRQ(ierr); 2165 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2166 } 2167 PetscFunctionReturn(0); 2168 } 2169