#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: copies element 0 of the
   tuple into element 1 (used below to gather permuted COO values). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert numerical values into a MPIAIJCUSPARSE matrix previously preallocated
   with MatSetPreallocationCOO_MPIAIJCUSPARSE().

   v is the array of coo_nd+coo_no values in the caller's original COO ordering;
   if a permutation (cusp->coo_p) was built during preallocation (mixed
   diagonal/off-diagonal entries), the values are first gathered into
   cusp->coo_pw so that the diagonal-block values precede the off-diagonal ones.
   v may live on the host or on the device (detected via isCudaMem). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* permutation needed: gather v through coo_p into coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) and copy element-wise on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* diagonal-block values first, then the off-diagonal block at offset coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* values already in (diag,offdiag) order, or v == NULL */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled with up-to-date data on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on a (row,col,perm) tuple: true when the column index lies inside
   [cstart,cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside [cstart,cend),
   i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index to a local one by subtracting the ownership start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Preallocate a MPIAIJCUSPARSE matrix from COO (i,j) index arrays.

   The indices are copied to the device and partitioned into diagonal-block
   entries (column in [cmap->rstart,cmap->rend)) followed by off-diagonal
   entries; when both kinds are interleaved a permutation cusp->coo_p and a
   work array cusp->coo_pw are created for later use by MatSetValuesCOO.
   Off-diagonal global columns are sorted/uniqued to build b->garray and a
   global-to-local hash mapping, then the two sequential CUSPARSE blocks
   b->A and b->B are created and preallocated. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);   /* device copy of row indices */
  THRUSTINTARRAY         d_j(n);   /* device copy of column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diagonal/off-diagonal blocks and scatter data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input is interleaved and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p starts as the identity and records the partition permutation */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal portion of d_j to enumerate distinct columns */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* hash-based global->local mapping to translate jj into garray positions */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the parallel matrix's cusparse configuration to the blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* no values inserted yet: data is considered on the CPU and unassembled */
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix A_loc; if glob is requested, also return the global column indices
   of the merged matrix (owned columns first, then cmap entries). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns, shifted to global */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost columns from the map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat
B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  /* Standard nz/nnz preallocation for a MPIAIJCUSPARSE matrix: creates (on
     first call) and preallocates the sequential CUSPARSE diagonal (b->A) and
     off-diagonal (b->B) blocks and propagates the format/handle/stream
     configuration stored in the parallel matrix. */
  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only validation of the per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* NOTE(review): b->B is created with the full global column count here;
       it is compacted later by MatSetUpMultiply_MPIAIJ during assembly */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Per-product private data attached to C->product->data by
   MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE and consumed by the
   numeric phase and the destructor below. */
typedef struct {
  Mat       *mp;    /* intermediate products */
  PetscBool *mptmp; /* is the intermediate product temporary ? */
  PetscInt  cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r;
  PetscScalar *bufa;
  Mat         P_oth;

  /* may take advantage of merging product->B */
  Mat Bloc;

  /* cusparse does not have support to split between symbolic and numeric phases
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: final values; coo_w: staging when an SF gather is needed */
  PetscSF     sf;            /* if present, non-local values insertion (i.e. AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;

/* Destructor for the MatMatMPIAIJCUSPARSE product data: frees all buffers,
   intermediate matrices, and the star forest. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
  PetscInt             i;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
  for (i = 0; i < mmdata->cp; i++) {
    ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric phase of the matrix product: run the intermediate products,
   collect their values into a COO buffer, gather off-process contributions
   through the SF when present, and insert via MatSetValuesCOO. */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  /* when an SF exists, values are staged in coo_w and gathered into coo_v */
  tmp = mmdata->sf ?
mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  /* the symbolic phase already computed the numeric values once (reusesym);
     skip the update on the first numeric call only */
  mmdata->reusesym = PETSC_FALSE;
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products feed later ones; their values are not copied */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n += mm->nz;
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pt * A or A * P */
#define MAX_NUMBER_INTERMEDIATE 4
/* Symbolic phase: decompose the parallel product into up to
   MAX_NUMBER_INTERMEDIATE sequential products mp[], record for each one how
   its local row/column indices translate to global indices of C
   (rmapt/cmapt: 0 = copy as-is, 1 = shift by ownership start, 2 = indirect
   through rmapa/cmapa), build the global COO pattern, and preallocate C
   with MatSetPreallocationCOO. */
static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  Mat_Product            *product = C->product;
  Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
  Mat_MPIAIJ             *a,*p;
  MatMatMPIAIJCUSPARSE   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob = NULL;
  const PetscInt         *globidx,*P_oth_idx;
  const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
  PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
  MatProductType         ptype;
  PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
  PetscMPIInt            size;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  ptype = product->type;
  /* treat A^t*B as A*B when A is symmetric */
  if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    A = product->A;
    P = product->B;
    m = A->rmap->n;
    n = P->cmap->n;
    M = A->rmap->N;
    N = P->cmap->N;
    break;
  case MATPRODUCT_AtB:
    P = product->A;
    A = product->B;
    m = P->cmap->n;
    n = A->cmap->n;
    M = P->cmap->N;
    N = A->cmap->N;
    hasoffproc = PETSC_TRUE; /* P^t rows map to non-owned rows of C */
    break;
  case MATPRODUCT_PtAP:
    A = product->A;
    P = product->B;
    m = P->cmap->n;
    n = P->cmap->n;
    M = P->cmap->N;
    N = P->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
  if (size == 1) hasoffproc = PETSC_FALSE; /* sequential run: everything is local */

  for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = 0;
    cmapt[i] = 0;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  a = (Mat_MPIAIJ*)A->data;
  p = (Mat_MPIAIJ*)P->data;
  ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->reusesym = product->api_user;
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);

    if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
      /* P is product->B */
      ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
      ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
      rmapt[cp] = 1;        /* rows: owned, shift by rstart */
      cmapt[cp] = 2;        /* cols: indirect through merged-local index set */
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
      ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 1;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray; /* off-diagonal columns map through garray */
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 1;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    rmapt[cp] = 2;         /* rows are ghost rows of P: indirect through garray */
    rmapa[cp] = p->garray;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    break;
  case MATPRODUCT_PtAP:
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    /* P is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      /* A_off * P_oth: temporary product feeding the next entry */
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      mptmp[cp] = PETSC_TRUE;
      cp++;
      /* P_loc^t * (A_off * P_oth) */
      ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  /* persist the intermediate products in the product data */
  ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
  ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
  mmdata->cp = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
  C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;

  /* prepare coo coordinates for values insertion */
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
    if (mptmp[cp]) continue;
    ncoo += mm->nz;
  }
  ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
    PetscInt       *coi = coo_i + ncoo;
    PetscInt       *coj = coo_j + ncoo;
    const PetscInt mr  = mp[cp]->rmap->n;
    const PetscInt *jj = mm->j;
    const PetscInt *ii = mm->i;

    if (mptmp[cp]) continue;
    /* rows coo */
    if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
    else if (rmapt[cp] == 1) { /* local to global for owned rows of */
      const PetscInt rs = C->rmap->rstart;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    } else { /* offprocess */
      const PetscInt *rmap = rmapa[cp];
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    }
    /* columns coo */
    if (!cmapt[cp]) {
      ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
    } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
      const PetscInt cs = P->cmap->rstart;
      for (j = 0; j < mm->nz; j++) {
        coj[j] = jj[j] + cs;
      }
    } else { /* offdiag */
      const PetscInt *cmap = cmapa[cp];
      for (j = 0; j < mm->nz; j++) {
        coj[j] = cmap[jj[j]];
      }
    }
    ncoo += mm->nz;
  }
  if (glob) {
    ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
  }
  ierr = ISDestroy(&glob);CHKERRQ(ierr);
  if (P_oth_l2g) {
    ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);

  if (hasoffproc) { /* offproc values insertion */
    const PetscInt *sfdeg;
    const PetscInt n = P->cmap->n; /* shadows outer n on purpose: local leaves of the SF */
    PetscInt       ncoo2,*coo_i2,*coo_j2;

    /* build an SF from the global row indices and gather the (i,j) pairs
       owned by this rank; mmdata->coo_w stages the values before the gather */
    ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
    ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
    ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
    coo_i = coo_i2;
    coo_j = coo_j2;
    ncoo  = ncoo2;
  }

  /* preallocate with COO data */
  ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
  ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Select the CUSPARSE product implementation when B is MPIAIJCUSPARSE and not
   bound to the CPU; otherwise fall back to the generic MPIAIJ path. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Biscusp = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (Biscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_PtAP:
      mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
702 break; 703 default: 704 break; 705 } 706 } 707 if (!mat->ops->productsymbolic) { 708 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 709 } 710 PetscFunctionReturn(0); 711 } 712 713 /*@ 714 MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 715 716 Not collective 717 718 Input Parameters: 719 + A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE 720 - gen - the boolean flag 721 722 Level: intermediate 723 724 .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE 725 @*/ 726 PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen) 727 { 728 PetscErrorCode ierr; 729 PetscBool ismpiaij; 730 731 PetscFunctionBegin; 732 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 733 MatCheckPreallocated(A,1); 734 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr); 735 if (ismpiaij) { 736 Mat A_d,A_o; 737 738 ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr); 739 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr); 740 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr); 741 } else { 742 ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr); 743 } 744 PetscFunctionReturn(0); 745 } 746 747 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 748 { 749 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 750 PetscErrorCode ierr; 751 PetscInt nt; 752 753 PetscFunctionBegin; 754 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 755 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 756 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 757 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 758 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 759 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 760 PetscFunctionReturn(0); 761 } 762 763 PetscErrorCode 
MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      /* a stripped-down device mirror exists: zero its value arrays directly on the GPU */
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value-array pointer, then memset through it */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* always zero the host-side (Seq) blocks too */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x with communication/computation overlap (same pattern as MatMult) */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T*x: local transpose multiplies first, then a reverse scatter adds the off-process parts */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;
811 812 PetscFunctionBegin; 813 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 814 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 815 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 816 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 817 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 818 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 819 PetscFunctionReturn(0); 820 } 821 822 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 823 { 824 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 825 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 826 827 PetscFunctionBegin; 828 switch (op) { 829 case MAT_CUSPARSE_MULT_DIAG: 830 cusparseStruct->diagGPUMatFormat = format; 831 break; 832 case MAT_CUSPARSE_MULT_OFFDIAG: 833 cusparseStruct->offdiagGPUMatFormat = format; 834 break; 835 case MAT_CUSPARSE_ALL: 836 cusparseStruct->diagGPUMatFormat = format; 837 cusparseStruct->offdiagGPUMatFormat = format; 838 break; 839 default: 840 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 841 } 842 PetscFunctionReturn(0); 843 } 844 845 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 846 { 847 MatCUSPARSEStorageFormat format; 848 PetscErrorCode ierr; 849 PetscBool flg; 850 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 851 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 852 853 PetscFunctionBegin; 854 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 855 if (A->factortype==MAT_FACTOR_NONE) { 856 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 857 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 858 if (flg) { 859 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 860 } 861 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 862 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 863 if (flg) { 864 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 865 } 866 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 867 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 868 if (flg) { 869 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 870 } 871 } 872 ierr = PetscOptionsTail();CHKERRQ(ierr); 873 PetscFunctionReturn(0); 874 } 875 876 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 877 { 878 PetscErrorCode 
ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: make sure the ghost vector lives on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/* Destroy: free the device-side split-CSR mirror (if any), tear down cuSPARSE
   handles/stream and the COO scratch containers, then defer to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host so we can free the arrays it points at */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-offset arrays were only device-allocated when compressed-row storage forced a copy */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete
cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed methods installed by MatConvert before the base destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: switch the sub-matrices and ghost
   vector to their CUDA types, allocate/initialize the GPU-side struct, and install
   the CUSPARSE method table and composed functions. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* MAT_INPLACE_MATRIX: *newmat is B itself */

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat =
cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the CUSPARSE method table */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create an MPIAIJ matrix and convert it in place to MPIAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.
For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* single-rank communicators get the sequential CUSPARSE type */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.
 -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat; /* location of the cached device struct inside the Mat */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* raw device CSR row-offset/column-index pointers */
  PetscScalar                *aa,*ba;         /* raw device CSR value pointers */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a diagonal block exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: need raw CSR pointers for both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no device metadata to hand out */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: build the device-side split-CSR mirror and cache it in the Mat */
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat; /* h_mat is the host staging copy */
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is ussually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel to catch an out-of-bounds write in the loop below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap maps global column -> 1-based local index in B (0 means "not present") */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* compressed-row storage: host row offsets are not mirrored on the device, copy them up */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi; /* reuse the cuSPARSE copy's device row offsets */
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metdata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* already built on a previous call: just return the cached device pointer */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}