#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: copies the first tuple entry into the second.
   Used below to scatter permuted user values into the COO work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert numerical values given in COO layout (matching the i/j passed to
   MatSetPreallocationCOO) into the diagonal (a->A) and off-diagonal (a->B)
   sequential blocks.  If a permutation (cusp->coo_p) was computed at
   preallocation time, values are first reordered on the device so that the
   diagonal-block entries come first, followed by the off-diagonal ones. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries: diag + offdiag */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* user values need to be permuted into diag-first order */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* values live on the host: stage them through a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted input values, work array) so VecCUDAEquals performs coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted entries belong to the diagonal block, the remaining coo_no to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  }
else { /* no permutation needed: values already arrive split diag-first, offdiag-last */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* values were inserted on the device */
  PetscFunctionReturn(0);
}

/* Predicate on (row,col,perm) tuples: true when the column index lies inside
   the owned column range [_cstart,_cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it lies outside [_cstart,_cend)
   (i.e. the entry belongs to the off-diagonal block). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Maps a global index to a local one by subtracting the ownership start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Preallocate the parallel matrix from n COO (coo_i[],coo_j[]) pairs: splits
   entries between diagonal and off-diagonal blocks on the device, builds the
   column map (garray), recreates b->A and b->B as SEQAIJCUSPARSE and
   preallocates them with their respective COO subsets. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;       /* host copy of off-diagonal column indices */
  size_t                 noff = 0;  /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);    /* device copy of row indices */
  THRUSTINTARRAY         d_j(n);    /* device copy of column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* drop cusparse handles before destroying the blocks below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr =
MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  /* reset any permutation/work array left over from a previous preallocation */
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diag and offdiag entries are interleaved: stable split needed.
       Record the identity permutation alongside (i,j) and partition the triples
       so diag entries come first; coo_p then maps new positions to the user's
       original ordering and is used at MatSetValuesCOO time. */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diag entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # offdiag entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr =
cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the offdiag tail of d_j to obtain the distinct global columns (garray) */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the offdiag global column indices in jj to local (compact) indices via a hash map */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); /* compacted column space */
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr =
MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix A_loc; when glob is non-NULL also return the global column indices
   of the merged matrix (owned columns first, then the off-diagonal map). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,k,dn,on,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cst; off-diagonal columns come from cmap */
    for (k=0; k<dn+on; k++) gidx[k] = (k < dn) ? cst + k : cmap[k-dn];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat
B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* in debug mode, validate the user-provided per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* throw away the old column map / communication data; it will be rebuilt at assembly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block has no columns on a single process */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ?
B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the cusparse configuration of the parallel matrix to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Product data attached to C->product->data for MPIAIJCUSPARSE mat-mat products. */
typedef struct {
  Mat         *mp;    /* intermediate products */
  PetscBool   *mptmp; /* is the intermediate product temporary ? */
  PetscInt    cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r;
  PetscScalar *bufa;
  Mat         P_oth;

  /* may take advantage of merging product->B */
  Mat Bloc;

  /* cusparse does not have support to split between symbolic and numeric phases
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: final COO values; coo_w: staging buffer when an SF gather is needed */
  PetscSF     sf;            /* if present, non-local values insertion (i.e.
AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;

/* Destructor for the product data above: frees all buffers, intermediate
   matrices and the (optional) star forest. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
  PetscInt             i;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
  for (i = 0; i < mmdata->cp; i++) {
    ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric phase of the product: re-runs the numeric op of every intermediate
   product, concatenates their values into a COO value array and inserts them
   into C (gathering off-process contributions through mmdata->sf first when present). */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  /* local contributions are staged in coo_w when an off-process gather is needed */
  tmp = mmdata->sf ?
mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  /* the symbolic phase already produced up-to-date values once; only skip the first time */
  mmdata->reusesym = PETSC_FALSE;
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products do not contribute values directly */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n += mm->nz;
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pt * A or A * P */
#define MAX_NUMBER_INTERMEDIATE 4
/* Symbolic phase: expresses the parallel product as up to
   MAX_NUMBER_INTERMEDIATE sequential products, records row/column index maps
   for each of them, and preallocates C with the resulting COO pattern. */
static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  Mat_Product            *product = C->product;
  Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
  Mat_MPIAIJ             *a,*p;
  MatMatMPIAIJCUSPARSE   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob = NULL;
  const PetscInt         *globidx,*P_oth_idx;
  const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
  PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
  MatProductType         ptype;
  PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
  PetscMPIInt            size;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  ptype = product->type;
  /* with a symmetric A, A^t*B can be computed as the cheaper A*B */
  if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  /* determine local/global sizes of C and whether contributions land on other processes */
  switch (ptype) {
  case MATPRODUCT_AB:
    A = product->A;
    P = product->B;
    m = A->rmap->n;
    n = P->cmap->n;
    M = A->rmap->N;
    N = P->cmap->N;
    break;
  case MATPRODUCT_AtB:
    P = product->A;
    A = product->B;
    m = P->cmap->n;
    n = A->cmap->n;
    M = P->cmap->N;
    N = A->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  case MATPRODUCT_PtAP:
    A = product->A;
    P = product->B;
    m = P->cmap->n;
    n = P->cmap->n;
    M = P->cmap->N;
    N = P->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
  if (size == 1) hasoffproc = PETSC_FALSE; /* uniprocessor: everything is local */

  /* defaults for the index-map descriptors of each intermediate product
     (rmapt/cmapt: 0 = unused, 1 = shift by ownership start, 2 = indirect through rmapa/cmapa) */
  for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = 0;
    cmapt[i] = 0;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  a = (Mat_MPIAIJ*)A->data;
  p = (Mat_MPIAIJ*)P->data;
  ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->reusesym = product->api_user;
  /* build the list of sequential intermediate products */
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);

    if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
      /* P is product->B */
      ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
      ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
      ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 1;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 1;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = p->garray; /* rows of this product map through P's off-diagonal column map */
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    break;
  case MATPRODUCT_PtAP:
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    /* P is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      /* temporary product A_off * P_oth, consumed by the next product below */
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      mptmp[cp] = PETSC_TRUE;
      cp++;
      /* P_loc^t * (A_off * P_oth), using the temporary product mp[1] */
      ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  /* persist the intermediate products and their temporary flags */
  ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
  ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
  mmdata->cp = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
  C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;

  /* prepare coo coordinates for values insertion */
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
    if (mptmp[cp]) continue;
    ncoo += mm->nz;
  }
  ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm  = (Mat_SeqAIJ*)mp[cp]->data;
    PetscInt       *coi = coo_i + ncoo;
    PetscInt       *coj = coo_j + ncoo;
    const PetscInt mr   = mp[cp]->rmap->n;
    const PetscInt *jj  = mm->j;
    const PetscInt *ii  = mm->i;

    if (mptmp[cp]) continue;
    /* rows coo */
    if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
    else if (rmapt[cp] == 1) { /* local to global for owned rows of */
      const PetscInt rs = C->rmap->rstart;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    } else { /* offprocess */
      const PetscInt *rmap = rmapa[cp];
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    }
    /* columns coo */
    if (!cmapt[cp]) {
      ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
    } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
      const PetscInt cs = P->cmap->rstart;
      for (j = 0; j < mm->nz; j++) {
        coj[j] = jj[j] + cs;
      }
    } else { /* offdiag */
      const PetscInt *cmap = cmapa[cp];
      for (j = 0; j < mm->nz; j++) {
        coj[j] = cmap[jj[j]];
      }
    }
    ncoo += mm->nz;
  }
  if (glob) {
    ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
  }
  ierr = ISDestroy(&glob);CHKERRQ(ierr);
  if (P_oth_l2g) {
    ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
  }
  ierr =
ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);

  if (hasoffproc) { /* offproc values insertion */
    const PetscInt *sfdeg;
    const PetscInt n = P->cmap->n;
    PetscInt       ncoo2,*coo_i2,*coo_j2;

    /* build a star forest over the row layout of C (= column layout of P) and
       gather the COO coordinates to their owning process */
    ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i]; /* total entries received */
    ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
    /* coo_w stages local (pre-gather) values in the numeric phase; it keeps the pre-gather size */
    ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
    coo_i = coo_i2;
    coo_j = coo_j2;
    ncoo  = ncoo2;
  }

  /* preallocate with COO data */
  ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
  ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Select the CUSPARSE symbolic implementation for supported product types when
   B is an MPIAIJCUSPARSE matrix not bound to the CPU; otherwise fall back to
   the generic MPIAIJ dispatch. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Biscusp = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (Biscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_PtAP:
      mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
712 break; 713 default: 714 break; 715 } 716 } 717 if (!mat->ops->productsymbolic) { 718 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 719 } 720 PetscFunctionReturn(0); 721 } 722 723 /*@ 724 MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 725 726 Not collective 727 728 Input Parameters: 729 + A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE 730 - gen - the boolean flag 731 732 Level: intermediate 733 734 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE 735 @*/ 736 PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen) 737 { 738 PetscErrorCode ierr; 739 PetscBool ismpiaij; 740 741 PetscFunctionBegin; 742 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 743 MatCheckPreallocated(A,1); 744 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr); 745 if (ismpiaij) { 746 Mat A_d,A_o; 747 748 ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr); 749 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr); 750 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr); 751 } else { 752 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr); 753 } 754 PetscFunctionReturn(0); 755 } 756 757 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 758 { 759 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 760 PetscErrorCode ierr; 761 PetscInt nt; 762 763 PetscFunctionBegin; 764 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 765 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 766 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 767 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 768 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 769 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 770 PetscFunctionReturn(0); 771 } 772 773 PetscErrorCode 
MatZeroEntries_MPIAIJCUSPARSE(Mat A) 774 { 775 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 776 PetscErrorCode ierr; 777 778 PetscFunctionBegin; 779 if (A->factortype == MAT_FACTOR_NONE) { 780 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 781 PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 782 if (d_mat) { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 784 Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 785 PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 786 cudaError_t err; 787 PetscScalar *vals; 788 ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 789 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 790 err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 791 err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 792 err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 793 } 794 } 795 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 796 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 797 798 PetscFunctionReturn(0); 799 } 800 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 801 { 802 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 803 PetscErrorCode ierr; 804 PetscInt nt; 805 806 PetscFunctionBegin; 807 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 808 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 809 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 810 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 811 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 812 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 813 PetscFunctionReturn(0); 814 } 815 816 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 817 { 818 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 819 PetscErrorCode ierr; 820 PetscInt nt; 
821 822 PetscFunctionBegin; 823 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 824 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 825 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 826 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 827 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 828 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 829 PetscFunctionReturn(0); 830 } 831 832 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 833 { 834 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 835 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 836 837 PetscFunctionBegin; 838 switch (op) { 839 case MAT_CUSPARSE_MULT_DIAG: 840 cusparseStruct->diagGPUMatFormat = format; 841 break; 842 case MAT_CUSPARSE_MULT_OFFDIAG: 843 cusparseStruct->offdiagGPUMatFormat = format; 844 break; 845 case MAT_CUSPARSE_ALL: 846 cusparseStruct->diagGPUMatFormat = format; 847 cusparseStruct->offdiagGPUMatFormat = format; 848 break; 849 default: 850 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 856 { 857 MatCUSPARSEStorageFormat format; 858 PetscErrorCode ierr; 859 PetscBool flg; 860 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 861 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 862 863 PetscFunctionBegin; 864 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 865 if (A->factortype==MAT_FACTOR_NONE) { 866 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 867 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 868 if (flg) { 869 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 870 } 871 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 872 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 873 if (flg) { 874 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 875 } 876 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 877 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 878 if (flg) { 879 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 880 } 881 } 882 ierr = PetscOptionsTail();CHKERRQ(ierr); 883 PetscFunctionReturn(0); 884 } 885 886 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 887 { 888 PetscErrorCode 
ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: switch the ghost-value work vector to CUDA so scatters stay on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/* Release all GPU-side resources (device split-CSR mirror, cuSPARSE handle, stream,
   COO permutation buffers) before delegating to the MPIAIJ destructor */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Pull the device struct to the host to recover the embedded device pointers we own.
       Row-offset arrays were cudaMalloc'ed by us only in the compressed-row case
       (otherwise they alias cuSPARSE-owned storage), so free them conditionally. */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the type-specific methods composed in MatConvert */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place or into a copy) to MPIAIJCUSPARSE: retype the
   sequential blocks and work vector to their CUDA variants, allocate the GPU-side
   bookkeeping struct, and install the CUSPARSE method table */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* MAT_INPLACE_MATRIX falls through: *newmat is B itself */

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain MPIAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.
For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or nnz parameters (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* single-rank communicators get the sequential CUSPARSE type directly */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.
-mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
// On first call this builds a device-resident PetscSplitCSRDataStructure that aliases
// the cuSPARSE CSR arrays (diag and offdiag blocks), caches it in spptr, and marks the
// matrix writable from the device; subsequent calls return the cached pointer.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat; /* location of the cached device pointer inside spptr */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* raw device CSR arrays borrowed from cuSPARSE storage */
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diag" block exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* MPI: grab device CSR arrays of both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no writable device mirror */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: build the host-side descriptor h_mat, allocate its device copy, and
       splice in the borrowed CSR device pointers */
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel: verify the loop below never writes past the valid range */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* compressed-row storage: device copy of the full (uncompressed) host row offsets */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi; /* alias the cuSPARSE-owned row offsets */
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai; /* alias the cuSPARSE-owned row offsets */
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; /* already built: return the cached device struct */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}