/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;
  Mat_APMPI         *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A,1);
  ptap = (Mat_APMPI*)A->product->data;
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 2) {
        ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 3) {
        ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap = (Mat_APMPI*)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

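/*
  Usage sketch (not part of this file's implementation): these kernels are
  normally reached through the public MatPtAP()/MatProduct interface, e.g.

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  (symbolic + numeric)
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    (reuse symbolic data)

  The algType values reported by MatView_MPIAIJ_PtAP() above correspond to the
  implementations below; which one runs is chosen at MatProductSetFromOptions()
  time, e.g. with an option such as -matptap_via scalable|allatonce|allatonce_merged
  (the exact option name depends on the PETSc version).
*/
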
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  api = ap->i;
  apj = ap->j;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i+1] - api[i];
    apa  = ap->a + api[i];
    ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr);
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
  PetscCheckFalse(api[AP_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use the scalable version since we are in the MPI scalable version */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  PetscCheckFalse(c_seq->i[C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  PetscCheckFalse(c_seq->i[C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_oth->rmap->n],nout);
  PetscFunctionReturn(0);
}

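/*
  Structure of the scalable algorithm above (a sketch, in this file's notation):
  each rank stores A = [Ad Ao] (diagonal/off-diagonal blocks) and computes

      AP_loc = Ad*P_loc + Ao*P_oth      (local rows of A times P)
      C_loc  = Rd*AP_loc                (Rd = Pd^T: rows of C owned locally)
      C_oth  = Ro*AP_loc                (Ro = Po^T: rows of C owned elsewhere)

  C = P^T*A*P is then assembled by adding C_loc in place and shipping each row
  of C_oth to its owning rank (p->garray[i]) with MatSetValues(...,ADD_VALUES).
*/
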
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode           ierr;
  Mat_APMPI                *ptap;
  Mat_MPIAIJ               *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size,rank;
  Mat                      P_loc,P_oth;
  PetscFreeSpaceList       free_space=NULL,current_space=NULL;
  PetscInt                 am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt                 *lnk,i,k,pnz,row,nsend;
  PetscMPIInt              tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt                 **buf_rj,**buf_ri,**buf_ri_k;
  const PetscInt           *owners;
  PetscInt                 len,proc,*dnz,*onz,nzi,nspacedouble;
  PetscInt                 nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request              *swaits,*rwaits;
  MPI_Status               *sstatus,rstatus;
  PetscLayout              rowmap;
  PetscInt                 *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt              *len_r,*id_r;         /* array of length of comm->size, store send/recv matrix values */
  PetscInt                 *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ               *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar              *apv;
  PetscTable               ta;
  MatType                  mtype;
  const char               *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal                apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

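  /*
    Worked example of the row merge above (illustrative only): if row i of Ad
    has nonzero columns {2,7}, the symbolic row AP[i,:] is the sorted union of
    the column indices of P_loc rows 2 and 7 (plus any P_oth rows reached
    through Ao). PetscLLCondensedAddSorted_Scalable() accumulates that union in
    the linked list lnk, e.g. rows {1,4,9} and {4,6} merge to {1,4,6,9}, and
    PetscLLCondensedClean_Scalable() drains it into the current FreeSpace chunk.
  */
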
  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");CHKERRQ(ierr);

  ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_oth,"sorted");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetLocalSize(rowmap, pn);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(rowmap, 1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRanges(rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);

  ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");CHKERRQ(ierr);

  ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

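  /*
    Concrete example of the i-structure message format used in (4) above
    (illustrative): sending two rows with local row indices 3 and 8 holding 5
    and 2 nonzeros respectively gives len_si = 2*(2+1) = 6 and

        buf_si = [2,  3, 8,  0, 5, 7]
                  ^   ^--^   ^------^
               nrows  rows   i-structure (prefix sums of row lengths)

    The matching column indices travel separately in the coj message of (3).
  */
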
  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  PetscCheckFalse(c_oth->i[ptap->C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  PetscCheckFalse(c_loc->i[ptap->C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}

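/*
  The *_private helpers below support dof > 1, where each block of dof
  consecutive rows/columns of A corresponds to one row/column of P. A worked
  index example (illustrative): with dof = 2, row i = 5 of A maps to P row
  ii = i/dof = 2 with offset = i%dof = 1, and a nonzero of P in local column
  pj contributes to global column pj*dof + offset (plus pcstart for the
  diagonal block).
*/
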
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt       pcstart,pcend,column,offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart  = P->cmap->rstart;
  pcstart *= dof;
  pcend    = P->cmap->rend;
  pcend   *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr);
    }
  }
  /* off diag P */
  for (j=0; j<nzi; j++) {
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr);
    }
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row    = aj[j];
      offset = a->garray[row]%dof;
      row    = map[row];
      pnz    = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column>=pcstart && column<pcend) {
          ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
        } else {
          ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}

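/*
  The numeric helper below uses the same traversal as the symbolic one above,
  but accumulates values instead of column sets: conceptually, for each
  nonzero a_ik of row i and each nonzero p_kj of the matching row of P it
  performs

      hmap[j] += a_ik * p_kj

  via PetscHMapIVAddValue(), so duplicate columns are summed on the fly
  (a sketch of the idea, not additional API semantics).
*/
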
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar    ra,*aa,*pa;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart  = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }
  for (j=0; j<nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row    = aj[j];
      offset = a->garray[row]%dof;
      row    = map[row];
      ra     = aa[j];
      pnz    = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa     = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr);
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}

PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

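/*
  Shape of the all-at-once numeric kernels below (a sketch): each local row i
  of AP is accumulated in a hash map, drained with PetscHMapIVGetPairs(), then
  scaled by the entries of P^T and scattered, locally via
  MatSetValues(...,ADD_VALUES) and remotely via the c_rmt buffers and a
  PetscSFReduce. The remote rows are kept sorted by insertion: PetscFindInt()
  returns either the location of a match (loc >= 0) or -(insertion point)-1,
  e.g. searching 5 in [2,4,8] yields loc = -3, so 5 is inserted at position
  -(loc+1) = 2 after shifting the tail.
*/
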
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  for (i=0; i<am && pon; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
    } /* End j */
  } /* End i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);

  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart = pcstart*dof;
  pcend   = pcend*dof;
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pn; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
  ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row  = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

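/*
  The _merged variant below differs from the overlapping one above only in
  loop structure: instead of one sweep over rows for the remote
  (off-diagonal) contributions and a second sweep for the local ones, it
  computes each row of AP once and scatters it to both destinations in the
  same pass, so the PetscSFReduce exchange no longer overlaps with local work
  (hence the two option names "overlapping" and "merged").
*/
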
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn  *= dof;

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    } /* End j */
  } /* End i */

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row  = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

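/*
  Outline of the symbolic all-at-once construction below (a sketch of the
  existing flow, not new behavior):

    Pass 1:  per row, gather the distinct column indices of AP into PetscHSetI
             sets; split them between rows owned locally and rows destined for
             the owner of the corresponding P column (the c_rmt arrays).
    PetscSF: build a star forest from the off-diagonal columns of P to their
             owning ranks and reduce the remote column lists to the owners.
    Pass 2:  merge local and received columns to produce the dnz/onz counts
             fed to MatMPIAIJSetPreallocation().
*/
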
/* TODO: move algorithm selection to MatProductSetFromOptions */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm       comm;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscMPIInt    owner;
  const PetscInt *mappingindices;
  PetscBool      flg;
  const char     *algTypes[2] = {"overlapping","merged"};
  IS             map;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn  *= dof;
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  /* This equals the number of offdiag columns in P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
    }
  }

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);

  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSetUp(Cmpi);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

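/*
  The PetscSF pattern used by both symbolic variants, in miniature
  (illustrative): with leaves = off-diagonal columns of P and roots = the
  owning rows,

    PetscSFComputeDegreeBegin/End  -> how many leaves point at each root
    PetscSFGatherBegin/End         -> per-leaf counts collected at the roots
                                      (yields c_othi, the receive offsets)
    PetscSFScatterBegin/End        -> per-root offsets pushed back to leaves
                                      (yields c_rmtoffsets, the send offsets)

  A second, column-wise SF is then built from those offsets so that
  PetscSFReduce with MPI_REPLACE moves each sent column index into its
  destination slot one-to-one.
*/
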
pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets; 1321 PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj; 1322 PetscInt nalg=2,alg=0,offset,ii; 1323 PetscMPIInt owner; 1324 PetscBool flg; 1325 const char *algTypes[2] = {"merged","overlapping"}; 1326 const PetscInt *mappingindices; 1327 IS map; 1328 PetscErrorCode ierr; 1329 1330 PetscFunctionBegin; 1331 MatCheckProduct(Cmpi,5); 1332 PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty"); 1333 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1334 1335 /* Create symbolic parallel matrix Cmpi */ 1336 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 1337 pn *= dof; 1338 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1339 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1340 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1341 1342 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1343 ptap->reuse = MAT_INITIAL_MATRIX; 1344 ptap->algType = 3; 1345 1346 /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1347 ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr); 1348 ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr); 1349 1350 /* This equals to the number of offdiag columns in P */ 1351 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 1352 pon *= dof; 1353 /* offsets */ 1354 ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr); 1355 /* The number of columns we will send to remote ranks */ 1356 ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr); 1357 ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr); 1358 for (i=0; i<pon; i++) { 1359 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1360 } 1361 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 1362 ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr); 1363 /* Create hash table to merge all columns for C(i, :) */ 1364 ierr = PetscHSetICreate(&ht);CHKERRQ(ierr); 1365 ierr = PetscHSetICreate(&oht);CHKERRQ(ierr); 1366 ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr); 1367 for (i=0; i<pn; i++) { 1368 ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr); 1369 ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr); 1370 } 1371 1372 ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr); 1373 ptap->c_rmti[0] = 0; 1374 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1375 for (i=0; i<am && (pon || pn); i++) { 1376 /* Form one row of AP */ 1377 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1378 ierr = PetscHSetIClear(oht);CHKERRQ(ierr); 1379 offset = i%dof; 1380 ii = i/dof; 1381 /* If the off diag is empty, we should not do any calculation */ 1382 nzi = po->i[ii+1] - po->i[ii]; 1383 dnzi = pd->i[ii+1] - pd->i[ii]; 1384 if (!nzi && !dnzi) continue; 1385 1386 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr); 1387 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1388 ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr); 1389 /* If AP is empty, just continue */ 1390 if (!(htsize+htosize)) continue; 1391 1392 /* Form remote C(ii, :) */ 1393 poj = po->j + po->i[ii]; 1394 for (j=0; j<nzi; j++) { 1395 ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr); 1396 ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr); 1397 } 1398 1399 /* Form local C(ii, :) */ 1400 pdj = pd->j + pd->i[ii]; 1401 for (j=0; j<dnzi; j++) { 1402 
  /* 2) Pass 1: calculate the sizes of C_rmt (the matrix that needs to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If both the off-diagonal and the diagonal parts are empty, we should not do any calculation */
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr);
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]         = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);
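  /*
     Each entry i of the pon-sized arrays corresponds to one off-diagonal column
     of P (global column p->garray[i/dof]), i.e. to one remote row of C. The
     PetscSF built next has those entries as leaves and the owning rank's local
     rows as roots, so a single reduce moves every remote contribution to its
     owner. A sketch of the leaf->root association assembled below:

       owner/lidx from PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],...)
       iremote[i] = (rank = owner, index = lidx*dof + offset)
  */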
  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner  = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter is in the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves: the number of columns other ranks contribute to my rows */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns to be received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One-to-one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
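  /*
     The reduce above is effectively a one-sided scatter: the offsets gathered
     and scattered earlier guarantee that every leaf writes to a distinct root
     slot of c_othj, so MPI_REPLACE never combines two contributions and no
     entry is lost. After PetscSFReduceEnd(), c_othj holds, row by row (with
     offsets ptap->c_othi), the global column indices contributed by other ranks.
  */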
  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
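/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ() below is the "nonscalable" variant: it keeps
   a dense accumulator of length pN (the global number of columns of P) on each
   rank (ptap->apa, allocated near the end of the routine), which is fast for
   modest pN but whose memory footprint grows with the global problem size -
   hence the name. The scalable variant avoids this by working with compressed
   local indices through an ISLocalToGlobalMapping.
*/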
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;         /* arrays of length comm->size, store send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
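  /*
     The loop above merges sorted column lists through the condensed linked list
     lnk and appends each finished row pattern to a chain of "free space" blocks.
     When a block runs out, PetscFreeSpaceGet() links in a new block of size
     apnz + total_array_size, roughly doubling the total capacity each time;
     nspacedouble records how often that happened so the fill diagnostics below
     can suggest a better fill parameter for the next symbolic call.
  */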
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name);CHKERRQ(ierr);
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  /* ------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_oth,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr);
    k++;
  }
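  /*
     Communication/computation overlap: the Isends of coj above are left in
     flight while the symbolic product C_loc = Rd*AP_loc is computed below;
     only afterwards does the code wait on the receives. This hides most of
     the message latency behind the dominant local symbolic work.
  */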
  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (local on the receiving rank)
         [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr);
    k++;
    buf_si += len_si[proc];
  }
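  /*
     Example of the i-structure layout (for orientation): sending two rows with
     local indices 4 and 7 on the receiver, holding 3 and 5 nonzeros respectively,
     produces buf_si = {2, 4, 7, 0, 3, 8} of length len_si = 2*(2+1) = 6:
     the row count, the row indices, then the running offsets into the coj data
     that was sent separately under tagj.
  */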
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi */
  /* -------------------------------------- */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
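/*
   Reuse pattern: the numeric routine below consumes the symbolic data stashed
   in ptap (P_loc, P_oth, AP_loc, Rd, Ro, C_loc, C_oth) and flips ptap->reuse
   to MAT_REUSE_MATRIX on completion, so subsequent products with the same
   sparsity pattern redo only the numeric work. A user-level sketch (not part
   of this file):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); // symbolic + numeric
     // ... change numerical values of A (same nonzero pattern) ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);   // numeric only
*/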
MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col               = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col]          = 0.0;
    }
  }
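  /*
     The loop above is the classic sparse-accumulator pattern: AProw_nonscalable()
     scatters the values of row i of A*P into the dense work array apa (length pN,
     indexed by global column), then the known column pattern apj gathers them
     back into the CSR value array and resets each touched slot to zero - O(nnz)
     work per row, with no need to clear the whole dense array.
  */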
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  ierr = PetscObjectStateIncrease((PetscObject)AP_loc);CHKERRQ(ierr);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = MatProductNumeric(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductNumeric(ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and
     when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
     is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from
     a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* C_oth -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  PetscErrorCode      ierr;
  Mat_Product         *product = C->product;
  Mat                 A=product->A,P=product->B;
  MatProductAlgorithm alg=product->alg;
  PetscReal           fill=product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  ierr = PetscStrcmp(alg,"nonscalable",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    goto next;
  }

  /* allatonce */
  ierr = PetscStrcmp(alg,"allatonce",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
    goto next;
  }

  /* allatonce_merged */
  ierr = PetscStrcmp(alg,"allatonce_merged",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
    goto next;
  }

  /* backend general code */
  ierr = PetscStrcmp(alg,"backend",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatProductSymbolic_MPIAIJBACKEND(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(0);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(0);
}