/*
 Defines projective product routines where A is a MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

/*
   Viewer callback attached to a MatPtAP() product: with an INFO-format ASCII
   viewer it reports which PtAP algorithm (ptap->algType) built the product.
*/
PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;
  Mat_APMPI         *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A,1);
  ptap = (Mat_APMPI*)A->product->data;
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      /* algType encodes which implementation created this product (0..3) */
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 2) {
        ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 3) {
        ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

/*
   Destructor for the Mat_APMPI support structure attached to a PtAP product.
   Frees every sub-object that any of the PtAP algorithms may have created;
   members not used by the algorithm that built this product are NULL, which
   the PETSc destroy/free routines accept as no-ops.
*/
PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap = (Mat_APMPI*)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    /* AP_loc was built with MatCreateSeqAIJWithArrays(): the i/j/a arrays are
       owned here, so free them explicitly before destroying the wrapper Mat */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    /* buf_ri/buf_rj are arrays of pointers whose storage was allocated as one
       contiguous chunk anchored at element [0] */
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric phase of the memory-scalable parallel PtAP: fills the values of
   C = P^T*A*P using the symbolic structures stored in C->product->data.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq; 105 Mat_APMPI *ptap; 106 Mat AP_loc,C_loc,C_oth; 107 PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout; 108 PetscScalar *apa; 109 const PetscInt *cols; 110 const PetscScalar *vals; 111 112 PetscFunctionBegin; 113 MatCheckProduct(C,3); 114 ptap = (Mat_APMPI*)C->product->data; 115 if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data"); 116 if (!ptap->AP_loc) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()"); 117 118 ierr = MatZeroEntries(C);CHKERRQ(ierr); 119 120 /* 1) get R = Pd^T,Ro = Po^T */ 121 if (ptap->reuse == MAT_REUSE_MATRIX) { 122 ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 123 ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 124 } 125 126 /* 2) get AP_loc */ 127 AP_loc = ptap->AP_loc; 128 ap = (Mat_SeqAIJ*)AP_loc->data; 129 130 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 131 /*-----------------------------------------------------*/ 132 if (ptap->reuse == MAT_REUSE_MATRIX) { 133 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 134 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 135 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 136 } 137 138 /* 2-2) compute numeric A_loc*P - dominating part */ 139 /* ---------------------------------------------- */ 140 /* get data from symbolic products */ 141 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 142 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 143 144 api = ap->i; 145 apj = ap->j; 146 ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr); 147 for (i=0; i<am; i++) { 148 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 149 apnz = api[i+1] - api[i]; 150 apa = ap->a + api[i]; 
151 ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr); 152 AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa); 153 } 154 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr); 155 if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout); 156 157 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 158 /* Always use scalable version since we are in the MPI scalable version */ 159 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 160 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 161 162 C_loc = ptap->C_loc; 163 C_oth = ptap->C_oth; 164 165 /* add C_loc and Co to to C */ 166 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 167 168 /* C_loc -> C */ 169 cm = C_loc->rmap->N; 170 c_seq = (Mat_SeqAIJ*)C_loc->data; 171 cols = c_seq->j; 172 vals = c_seq->a; 173 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 174 175 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 176 /* when there are no off-processor parts. */ 177 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 178 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 179 /* a table, and other, more complex stuff has to be done. 
*/ 180 if (C->assembled) { 181 C->was_assembled = PETSC_TRUE; 182 C->assembled = PETSC_FALSE; 183 } 184 if (C->was_assembled) { 185 for (i=0; i<cm; i++) { 186 ncols = c_seq->i[i+1] - c_seq->i[i]; 187 row = rstart + i; 188 ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 189 cols += ncols; vals += ncols; 190 } 191 } else { 192 ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr); 193 } 194 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 195 if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 196 197 /* Co -> C, off-processor part */ 198 cm = C_oth->rmap->N; 199 c_seq = (Mat_SeqAIJ*)C_oth->data; 200 cols = c_seq->j; 201 vals = c_seq->a; 202 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 203 for (i=0; i<cm; i++) { 204 ncols = c_seq->i[i+1] - c_seq->i[i]; 205 row = p->garray[i]; 206 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 207 cols += ncols; vals += ncols; 208 } 209 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 210 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 211 212 ptap->reuse = MAT_REUSE_MATRIX; 213 214 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 215 if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 216 PetscFunctionReturn(0); 217 } 218 219 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi) 220 { 221 PetscErrorCode ierr; 222 Mat_APMPI *ptap; 223 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 224 MPI_Comm comm; 225 PetscMPIInt size,rank; 226 Mat P_loc,P_oth; 227 PetscFreeSpaceList 
free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  if (Cmpi->product->data) SETERRQ(PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);

  /* the off-diagonal block of A only exists on more than one process */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attached it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0; /* 0 == scalable algorithm, see MatView_MPIAIJ_PtAP() */

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0]; /* number of distinct columns accumulated for row i */
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  /* compact out unused global columns; ltog maps compacted indices back to global */
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");CHKERRQ(ierr);

  ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_oth,"sorted");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* expand C_oth's compacted column indices to global before sending */
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  /* overlapped with the communication above */
  ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);

  ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");CHKERRQ(ierr);

  ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* map C_oth's and C_loc's column indices back to the compacted local
     numbering so the numeric phase can re-expand them */
  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}

/*
   Collects, for row i of A (with dof interlaced fields), the set of column
   indices of row i of A*P: diagonal-block columns into dht, off-diagonal
   columns into oht.  Used by the allatonce symbolic phase.
*/
PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt
dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt       pcstart,pcend,column,offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* column ownership range of P, scaled by dof (P stores one row per dof-block) */
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row = aj[j];
    offset = row%dof; /* which field within the dof-block this column addresses */
    row /= dof;       /* block row of P */
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      /* Pd columns are local; shift by pcstart to make them global */
      ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr);
    }
  }
  /* off diag P */
  for (j=0; j<nzi; j++) {
    row = aj[j];
    offset = row%dof;
    row /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      /* Po columns are indices into garray, already global after lookup */
      ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr);
    }
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row = aj[j];
      offset = a->garray[row]%dof;
      row = map[row]; /* map Ao column to the corresponding row of P_oth */
      pnz = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col] * dof + offset;
        /* classify by ownership: columns in [pcstart,pcend) are diagonal-block */
        if (column>=pcstart && column<pcend) {
          ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
        } else {
          ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}

/*
   Numeric counterpart of the routine above: accumulates the values of row i
   of A*P (dof interlaced fields) into the hash map hmap, keyed by global
   column index.  Used by the allatonce numeric phase.
*/
PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar    ra,*aa,*pa;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj = ad->j + ai[i];
  aa = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra = aa[j];
    row = aj[j];
    offset = row%dof;
    row /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      /* hmap accumulates: duplicate keys add their values */
      ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }
  /* same A row against the off-diagonal block Po of P */
  for (j=0; j<nzi; j++) {
    ra = aa[j];
    row = aj[j];
    offset = row%dof;
    row /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    pa = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj = ao->j + ai[i];
    aa = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row = aj[j];
      offset = a->garray[row]%dof;
      row = map[row]; /* map Ao column to the corresponding row of P_oth */
      ra = aa[j];
      pnz = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr);
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}

PetscErrorCode
MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

/*
   Numeric phase of the "allatonce" PtAP for MPIAIJ (dof = number of
   interlaced fields sharing the same P pattern): computes C = P^T*A*P row
   by row using a hash map, and exchanges off-process contributions with a
   PetscSF built by the symbolic phase (ptap->sf, ptap->c_rmti/c_othi).
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  if (!ptap->P_oth) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  /* mapping from Ao columns to P_oth rows, stashed on P_oth by the symbolic phase */
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* c_rmtj/c_rmta: buffers for rows of C owned by other processes (Po^T part) */
  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  /* Phase 1: accumulate Po^T*(A*P) contributions destined for remote rows */
  for (i=0; i<am && pon; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;
    /* AP[i,:] into hmap (global column -> value) */
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* keep each remote row sorted by column via binary search + insert */
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
        ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      } /* End jj */
    } /* End j */
  } /* End i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);

  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  /* start moving the remote contributions to their owners; overlapped with
     the local Pd^T*(A*P) computation below */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart = pcstart*dof;
  pcend   = pcend*dof;
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  /* Phase 2: accumulate Pd^T*(A*P) contributions into locally-owned rows */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
  ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/* dof = 1 convenience wrapper for the allatonce numeric phase */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data
  /* (continuation of MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged declarations) */
  Mat_APMPI      *ptap;
  PetscHMapIV    hmap;     /* accumulator for one row of AP = A*P */
  PetscInt       i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  if (!ptap->P_oth) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  /* pon = remotely owned C rows (offdiag columns of P), pn = locally owned C rows */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  /* cmaxr bounds the nonzeros of any single target row, remote or local */
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  /* Single sweep: each row of AP feeds both the remote (Po) and local (Pd) parts of C */
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* staging row is kept sorted; binary search for the column */
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }/* End j */
  } /* End i */

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  /* Ship staged remote contributions to their owning ranks */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row  = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the merged all-at-once PtAP numeric phase */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce - symbolic phase of C = P^T*A*P for the
   all-at-once algorithm: counts the nonzeros of C with per-row hash sets, builds
   the PetscSF used to ship remotely owned rows, and preallocates Cmpi.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm       comm;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscMPIInt    owner;
  const PetscInt *mappingindices;
  PetscBool      flg;
  const char     *algTypes[2] = {"overlapping","merged"};
  IS             map;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  if (Cmpi->product->data) SETERRQ(PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  /* This equals to the number of offdiag columns in P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  /* one hash set per remotely owned C row accumulates its column indices */
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    /* both diag and offdiag columns merged into the same set ht here */
    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
    }
  }

  /* prefix-sum the per-row counts into the offsets c_rmti */
  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  /* one leaf per remotely owned row; the root is the (owner,local index) of that row */
  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns is received for each row */
  ptap->c_othi[0] = 0;
  rootspacesize   = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  /* tell each sender where its data lands in the receiver's buffer */
  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  /* now one leaf per column entry to be sent */
  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);

  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  /* per local C row: hta collects diag-block columns, hto offdiag-block columns */
  ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);

  /* merge the received column indices into the per-row diag/offdiag counts */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSetUp(Cmpi);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
  }
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the all-at-once PtAP symbolic phase */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged - symbolic phase of the "merged"
   all-at-once algorithm: remote and local row counting share a single sweep over A.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm       comm;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI
                 ht,oht,*hta,*hto,*htd;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscMPIInt    owner;
  PetscBool      flg;
  const char     *algTypes[2] = {"merged","overlapping"};
  const PetscInt *mappingindices;
  IS             map;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  if (Cmpi->product->data) SETERRQ(PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);

  /* This equals to the number of offdiag columns in P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  /* hta: one set per remotely owned C row */
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  /* htd/hto: diag/offdiag column sets per locally owned C row */
  ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) - remote rows receive both diag and offdiag columns */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr);
    }

    /* Form local C(ii, :) - keep diag and offdiag columns separate for preallocation */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* prefix-sum the per-row counts into the offsets c_rmti */
  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  /* one leaf per remotely owned row; root is the (owner,local index) of that row */
  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof+offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns is received for each row */
  ptap->c_othi[0] = 0;
  rootspacesize   = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  /* tell each sender where its data lands in the receiver's buffer */
  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  /* now one leaf per column entry to be sent */
  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  /* merge the received column indices into the per-row diag/offdiag counts */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
  }
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the merged all-at-once PtAP symbolic phase */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt
tagi,tagj,*len_si,*len_s,*len_ri,nrecv; 1581 PETSC_UNUSED PetscMPIInt icompleted=0; 1582 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1583 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 1584 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1585 MPI_Request *swaits,*rwaits; 1586 MPI_Status *sstatus,rstatus; 1587 PetscLayout rowmap; 1588 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1589 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 1590 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi; 1591 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 1592 PetscScalar *apv; 1593 PetscTable ta; 1594 MatType mtype; 1595 const char *prefix; 1596 #if defined(PETSC_USE_INFO) 1597 PetscReal apfill; 1598 #endif 1599 1600 PetscFunctionBegin; 1601 MatCheckProduct(Cmpi,4); 1602 if (Cmpi->product->data) SETERRQ(PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty"); 1603 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1604 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1605 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 1606 1607 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 1608 1609 /* create symbolic parallel matrix Cmpi */ 1610 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1611 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1612 1613 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1614 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1615 1616 /* create struct Mat_APMPI and attached it to C later */ 1617 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1618 ptap->reuse = MAT_INITIAL_MATRIX; 1619 ptap->algType = 1; 1620 1621 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1622 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1623 /* get P_loc by taking all 
local rows of P */ 1624 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1625 1626 /* (0) compute Rd = Pd^T, Ro = Po^T */ 1627 /* --------------------------------- */ 1628 ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 1629 ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 1630 1631 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1632 /* ----------------------------------------------------------------- */ 1633 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1634 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1635 1636 /* create and initialize a linked list */ 1637 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 1638 MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta); 1639 MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta); 1640 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 1641 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1642 1643 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 1644 1645 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1646 if (ao) { 1647 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 1648 } else { 1649 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 1650 } 1651 current_space = free_space; 1652 nspacedouble = 0; 1653 1654 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 1655 api[0] = 0; 1656 for (i=0; i<am; i++) { 1657 /* diagonal portion: Ad[i,:]*P */ 1658 ai = ad->i; pi = p_loc->i; 1659 nzi = ai[i+1] - ai[i]; 1660 aj = ad->j + ai[i]; 1661 for (j=0; j<nzi; j++) { 1662 row = aj[j]; 1663 pnz = pi[row+1] - pi[row]; 1664 Jptr = p_loc->j + pi[row]; 1665 /* add non-zero cols of P into the sorted 
linked list lnk */ 1666 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1667 } 1668 /* off-diagonal portion: Ao[i,:]*P */ 1669 if (ao) { 1670 ai = ao->i; pi = p_oth->i; 1671 nzi = ai[i+1] - ai[i]; 1672 aj = ao->j + ai[i]; 1673 for (j=0; j<nzi; j++) { 1674 row = aj[j]; 1675 pnz = pi[row+1] - pi[row]; 1676 Jptr = p_oth->j + pi[row]; 1677 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1678 } 1679 } 1680 apnz = lnk[0]; 1681 api[i+1] = api[i] + apnz; 1682 if (ap_rmax < apnz) ap_rmax = apnz; 1683 1684 /* if free space is not available, double the total space in the list */ 1685 if (current_space->local_remaining<apnz) { 1686 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 1687 nspacedouble++; 1688 } 1689 1690 /* Copy data into free space, then initialize lnk */ 1691 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1692 1693 current_space->array += apnz; 1694 current_space->local_used += apnz; 1695 current_space->local_remaining -= apnz; 1696 } 1697 /* Allocate space for apj and apv, initialize apj, and */ 1698 /* destroy list of free space and other temporary array(s) */ 1699 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 1700 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 1701 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1702 1703 /* Create AP_loc for reuse */ 1704 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 1705 ierr = MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name);CHKERRQ(ierr); 1706 #if defined(PETSC_USE_INFO) 1707 if (ao) { 1708 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 1709 } else { 1710 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 1711 } 1712 ptap->AP_loc->info.mallocs = nspacedouble; 1713 ptap->AP_loc->info.fill_ratio_given = fill; 1714 ptap->AP_loc->info.fill_ratio_needed = apfill; 1715 1716 if 
(api[am]) { 1717 ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 1718 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 1719 } else { 1720 ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 1721 } 1722 #endif 1723 1724 /* (2-1) compute symbolic Co = Ro*AP_loc */ 1725 /* ------------------------------------ */ 1726 ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr); 1727 ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr); 1728 ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr); 1729 ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr); 1730 ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr); 1731 ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr); 1732 ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");CHKERRQ(ierr); 1733 ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr); 1734 ierr = MatProductSetAlgorithm(ptap->C_oth,"default");CHKERRQ(ierr); 1735 ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr); 1736 ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr); 1737 ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr); 1738 1739 /* (3) send coj of C_oth to other processors */ 1740 /* ------------------------------------------ */ 1741 /* determine row ownership */ 1742 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 1743 rowmap->n = pn; 1744 rowmap->bs = 1; 1745 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 1746 owners = rowmap->range; 1747 1748 /* determine the number of messages to send, their lengths */ 1749 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 1750 ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr); 1751 ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr); 1752 1753 c_oth = 
(Mat_SeqAIJ*)ptap->C_oth->data; 1754 coi = c_oth->i; coj = c_oth->j; 1755 con = ptap->C_oth->rmap->n; 1756 proc = 0; 1757 for (i=0; i<con; i++) { 1758 while (prmap[i] >= owners[proc+1]) proc++; 1759 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1760 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 1761 } 1762 1763 len = 0; /* max length of buf_si[], see (4) */ 1764 owners_co[0] = 0; 1765 nsend = 0; 1766 for (proc=0; proc<size; proc++) { 1767 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1768 if (len_s[proc]) { 1769 nsend++; 1770 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1771 len += len_si[proc]; 1772 } 1773 } 1774 1775 /* determine the number and length of messages to receive for coi and coj */ 1776 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 1777 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 1778 1779 /* post the Irecv and Isend of coj */ 1780 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1781 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1782 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 1783 for (proc=0, k=0; proc<size; proc++) { 1784 if (!len_s[proc]) continue; 1785 i = owners_co[proc]; 1786 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr); 1787 k++; 1788 } 1789 1790 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1791 /* ---------------------------------------- */ 1792 ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr); 1793 ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr); 1794 ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr); 1795 ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr); 1796 ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr); 1797 ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");CHKERRQ(ierr); 
1798 ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr); 1799 ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr); 1800 ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr); 1801 ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr); 1802 ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr); 1803 1804 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 1805 1806 /* receives coj are complete */ 1807 for (i=0; i<nrecv; i++) { 1808 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr); 1809 } 1810 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1811 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);} 1812 1813 /* add received column indices into ta to update Crmax */ 1814 for (k=0; k<nrecv; k++) {/* k-th received message */ 1815 Jptr = buf_rj[k]; 1816 for (j=0; j<len_r[k]; j++) { 1817 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 1818 } 1819 } 1820 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 1821 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 1822 1823 /* (4) send and recv coi */ 1824 /*-----------------------*/ 1825 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1826 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1827 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 1828 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1829 for (proc=0,k=0; proc<size; proc++) { 1830 if (!len_s[proc]) continue; 1831 /* form outgoing message for i-structure: 1832 buf_si[0]: nrows to be sent 1833 [1:nrows]: row index (global) 1834 [nrows+1:2*nrows+1]: i-structure index 1835 */ 1836 /*-------------------------------------------*/ 1837 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 1838 buf_si_i = buf_si + nrows+1; 1839 buf_si[0] = nrows; 1840 buf_si_i[0] = 0; 1841 nrows = 0; 1842 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1843 nzi = coi[i+1] - coi[i]; 1844 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* 
i-structure */ 1845 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1846 nrows++; 1847 } 1848 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr); 1849 k++; 1850 buf_si += len_si[proc]; 1851 } 1852 for (i=0; i<nrecv; i++) { 1853 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr); 1854 } 1855 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1856 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);} 1857 1858 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 1859 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1860 ierr = PetscFree(swaits);CHKERRQ(ierr); 1861 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1862 1863 /* (5) compute the local portion of Cmpi */ 1864 /* ------------------------------------------ */ 1865 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1866 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 1867 current_space = free_space; 1868 1869 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 1870 for (k=0; k<nrecv; k++) { 1871 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1872 nrows = *buf_ri_k[k]; 1873 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1874 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1875 } 1876 1877 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 1878 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 1879 for (i=0; i<pn; i++) { 1880 /* add C_loc into Cmpi */ 1881 nzi = c_loc->i[i+1] - c_loc->i[i]; 1882 Jptr = c_loc->j + c_loc->i[i]; 1883 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1884 1885 /* add received col data into lnk */ 1886 for (k=0; k<nrecv; k++) { /* k-th received message */ 1887 if (i == *nextrow[k]) { /* i-th row */ 1888 nzi = *(nextci[k]+1) - *nextci[k]; 1889 Jptr = buf_rj[k] + *nextci[k]; 1890 
ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1891 nextrow[k]++; nextci[k]++; 1892 } 1893 } 1894 nzi = lnk[0]; 1895 1896 /* copy data into free space, then initialize lnk */ 1897 ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1898 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 1899 } 1900 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1901 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1902 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 1903 1904 /* local sizes and preallocation */ 1905 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1906 if (P->cmap->bs > 0) { 1907 ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr); 1908 ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr); 1909 } 1910 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1911 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1912 1913 /* members in merge */ 1914 ierr = PetscFree(id_r);CHKERRQ(ierr); 1915 ierr = PetscFree(len_r);CHKERRQ(ierr); 1916 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 1917 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 1918 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 1919 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 1920 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 1921 1922 ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr); 1923 1924 /* attach the supporting struct to Cmpi for reuse */ 1925 Cmpi->product->data = ptap; 1926 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1927 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1928 1929 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1930 Cmpi->assembled = PETSC_FALSE; 1931 PetscFunctionReturn(0); 1932 } 1933 1934 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 1935 { 1936 PetscErrorCode ierr; 1937 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 1938 Mat_SeqAIJ 
*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 1939 Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq; 1940 Mat_APMPI *ptap; 1941 Mat AP_loc,C_loc,C_oth; 1942 PetscInt i,rstart,rend,cm,ncols,row; 1943 PetscInt *api,*apj,am = A->rmap->n,j,col,apnz; 1944 PetscScalar *apa; 1945 const PetscInt *cols; 1946 const PetscScalar *vals; 1947 1948 PetscFunctionBegin; 1949 MatCheckProduct(C,3); 1950 ptap = (Mat_APMPI*)C->product->data; 1951 if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data"); 1952 if (!ptap->AP_loc) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()"); 1953 1954 ierr = MatZeroEntries(C);CHKERRQ(ierr); 1955 /* 1) get R = Pd^T,Ro = Po^T */ 1956 if (ptap->reuse == MAT_REUSE_MATRIX) { 1957 ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 1958 ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 1959 } 1960 1961 /* 2) get AP_loc */ 1962 AP_loc = ptap->AP_loc; 1963 ap = (Mat_SeqAIJ*)AP_loc->data; 1964 1965 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 1966 /*-----------------------------------------------------*/ 1967 if (ptap->reuse == MAT_REUSE_MATRIX) { 1968 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 1969 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1970 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1971 } 1972 1973 /* 2-2) compute numeric A_loc*P - dominating part */ 1974 /* ---------------------------------------------- */ 1975 /* get data from symbolic products */ 1976 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1977 if (ptap->P_oth) { 1978 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1979 } 1980 apa = ptap->apa; 1981 api = ap->i; 1982 apj = ap->j; 1983 for (i=0; i<am; i++) { 1984 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 
1985 AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa); 1986 apnz = api[i+1] - api[i]; 1987 for (j=0; j<apnz; j++) { 1988 col = apj[j+api[i]]; 1989 ap->a[j+ap->i[i]] = apa[col]; 1990 apa[col] = 0.0; 1991 } 1992 } 1993 /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */ 1994 ierr = PetscObjectStateIncrease((PetscObject)AP_loc);CHKERRQ(ierr); 1995 1996 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 1997 ierr = MatProductNumeric(ptap->C_loc);CHKERRQ(ierr); 1998 ierr = MatProductNumeric(ptap->C_oth);CHKERRQ(ierr); 1999 C_loc = ptap->C_loc; 2000 C_oth = ptap->C_oth; 2001 2002 /* add C_loc and Co to to C */ 2003 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 2004 2005 /* C_loc -> C */ 2006 cm = C_loc->rmap->N; 2007 c_seq = (Mat_SeqAIJ*)C_loc->data; 2008 cols = c_seq->j; 2009 vals = c_seq->a; 2010 2011 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 2012 /* when there are no off-processor parts. */ 2013 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 2014 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 2015 /* a table, and other, more complex stuff has to be done. 
*/ 2016 if (C->assembled) { 2017 C->was_assembled = PETSC_TRUE; 2018 C->assembled = PETSC_FALSE; 2019 } 2020 if (C->was_assembled) { 2021 for (i=0; i<cm; i++) { 2022 ncols = c_seq->i[i+1] - c_seq->i[i]; 2023 row = rstart + i; 2024 ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 2025 cols += ncols; vals += ncols; 2026 } 2027 } else { 2028 ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr); 2029 } 2030 2031 /* Co -> C, off-processor part */ 2032 cm = C_oth->rmap->N; 2033 c_seq = (Mat_SeqAIJ*)C_oth->data; 2034 cols = c_seq->j; 2035 vals = c_seq->a; 2036 for (i=0; i<cm; i++) { 2037 ncols = c_seq->i[i+1] - c_seq->i[i]; 2038 row = p->garray[i]; 2039 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 2040 cols += ncols; vals += ncols; 2041 } 2042 2043 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2044 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2045 2046 ptap->reuse = MAT_REUSE_MATRIX; 2047 PetscFunctionReturn(0); 2048 } 2049 2050 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) 2051 { 2052 PetscErrorCode ierr; 2053 Mat_Product *product = C->product; 2054 Mat A=product->A,P=product->B; 2055 MatProductAlgorithm alg=product->alg; 2056 PetscReal fill=product->fill; 2057 PetscBool flg; 2058 2059 PetscFunctionBegin; 2060 /* scalable: do R=P^T locally, then C=R*A*P */ 2061 ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr); 2062 if (flg) { 2063 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);CHKERRQ(ierr); 2064 C->ops->productnumeric = MatProductNumeric_PtAP; 2065 goto next; 2066 } 2067 2068 /* nonscalable: do R=P^T locally, then C=R*A*P */ 2069 ierr = PetscStrcmp(alg,"nonscalable",&flg);CHKERRQ(ierr); 2070 if (flg) { 2071 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); 2072 goto next; 2073 } 2074 2075 /* allatonce */ 2076 ierr = PetscStrcmp(alg,"allatonce",&flg);CHKERRQ(ierr); 2077 if (flg) { 2078 
ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr); 2079 goto next; 2080 } 2081 2082 /* allatonce_merged */ 2083 ierr = PetscStrcmp(alg,"allatonce_merged",&flg);CHKERRQ(ierr); 2084 if (flg) { 2085 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr); 2086 goto next; 2087 } 2088 2089 /* backend general code */ 2090 ierr = PetscStrcmp(alg,"backend",&flg);CHKERRQ(ierr); 2091 if (flg) { 2092 ierr = MatProductSymbolic_MPIAIJBACKEND(C);CHKERRQ(ierr); 2093 PetscFunctionReturn(0); 2094 } 2095 2096 /* hypre */ 2097 #if defined(PETSC_HAVE_HYPRE) 2098 ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr); 2099 if (flg) { 2100 ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr); 2101 C->ops->productnumeric = MatProductNumeric_PtAP; 2102 PetscFunctionReturn(0); 2103 } 2104 #endif 2105 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported"); 2106 2107 next: 2108 C->ops->productnumeric = MatProductNumeric_PtAP; 2109 PetscFunctionReturn(0); 2110 } 2111