#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor for use with a zip iterator: assigns tuple element <0> to
   tuple element <1>.  Used below to scatter user-provided COO values through a
   permutation into the coo_pw work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert the numerical values v (in the COO ordering established by
   MatSetPreallocationCOO_MPIAIJCUSPARSE below) into the diagonal (a->A) and
   off-diagonal (a->B) sequential blocks.

   If a permutation coo_p was recorded during preallocation (i.e. diagonal and
   off-diagonal entries were interleaved in the user's COO input), v is first
   permuted on the GPU into the work array coo_pw; otherwise v is forwarded
   directly, with the off-diagonal block starting at offset coo_nd.
   v may be either host or device memory (detected with isCudaMem); host data
   is staged through a temporary device array w.
   On exit the matrix is marked assembled with data resident on the GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* permutation needed: scatter v through coo_p into coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage it on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so a single for_each performs the scatter */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* permuted values: first coo_nd entries go to the diagonal block, rest to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* no permutation recorded (or v == NULL): forward v as-is, split at coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,...) tuples: true when column (tuple element <1>) lies in
   the local diagonal block [_cstart,_cend).  Used with thrust::partition. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside the local diagonal
   block [_cstart,_cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shifts a global index by -_start, i.e. maps global indices to local ones. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Preallocate the MPIAIJCUSPARSE matrix B from n COO entries (coo_i, coo_j)
   given in global indices.  The entries are partitioned on the GPU into
   diagonal-block and off-diagonal-block groups (recording the permutation in
   cusp->coo_p when the two were interleaved, for later use by
   MatSetValuesCOO_MPIAIJCUSPARSE), the column map b->garray is built, and the
   two sequential blocks b->A and b->B are created and COO-preallocated. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n);   /* device copies of the COO row/column indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous sequential blocks; they are rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr =
MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Detect whether diagonal and off-diagonal entries are interleaved: if an
     off-diagonal entry is followed by a diagonal one, a stable partition is
     needed and the permutation is recorded in coo_p for later value insertion */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* partition (i,j,perm) triples so diagonal entries come first; perm records
       where each entry originally was */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries  */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal columns on the GPU to build garray */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the (still global) off-diagonal columns in jj to local indices
     into garray, using a hash-based global-to-local mapping */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal block b->A (n_local x n_local) and
     off-diagonal block b->B (n_local x noff) */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate format/handle/stream settings from the parallel matrix to the blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix *A_loc (via MatSeqAIJCUSPARSEMergeMats).  If glob is non-NULL, also
   return an IS with the global column index of each local column of the merged
   matrix: owned columns first (offset by the column ownership start), then the
   ghost columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;  /* owned columns */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];  /* ghost columns */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Standard (non-COO) preallocation for MPIAIJCUSPARSE: validates nnz arrays in
   debug mode, creates the two sequential MATSEQAIJCUSPARSE blocks on first use,
   preallocates them, and propagates cusparse format/handle/stream settings. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat
B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks on the per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* off-diagonal block is created with the full global column count here;
       it is compacted later by MatSetUpMultiply_MPIAIJ */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate cusparse configuration from the parallel matrix to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Per-product context used by the MatProduct implementation below; attached to
   C->product->data and destroyed by MatDestroy_MatMatMPIAIJCUSPARSE. */
typedef struct {
  Mat       *mp;    /* intermediate products */
  PetscBool *mptmp; /* is the intermediate product temporary ? */
  PetscInt  cp;     /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s,*startsj_r;
  PetscScalar *bufa;
  Mat         P_oth;

  /* may take advantage of merging product->B */
  Mat Bloc;

  /* cusparse does not have support to split between symbolic and numeric phases
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v,*coo_w; /* coo_v: values handed to MatSetValuesCOO; coo_w: pre-gather staging when sf is used */
  PetscSF     sf;            /* if present, non-local values insertion (i.e.
AtB or PtAP) */
} MatMatMPIAIJCUSPARSE;

/* Destructor for the MatMatMPIAIJCUSPARSE product context: releases all
   buffers, intermediate product matrices, and the optional star forest. */
PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
{
  MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
  PetscInt             i;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
  for (i = 0; i < mmdata->cp; i++) {
    ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
  ierr = PetscFree(mmdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric phase of the MPIAIJCUSPARSE MatProduct: re-runs the numeric phase of
   each intermediate product set up in the symbolic phase, concatenates their
   values into a COO value array (gathering off-process contributions through
   mmdata->sf when present), and inserts them into C with MatSetValuesCOO.
   On the first call after a symbolic phase with reusesym set, the temporary
   matrices (P_oth, Bloc) are not refreshed since the symbolic phase just built
   them. */
static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  MatMatMPIAIJCUSPARSE *mmdata;
  PetscScalar          *tmp;
  PetscInt             i,n;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
  /* stage values in coo_w when an SF gather is needed, else write coo_v directly */
  tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) {
      ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    }
    if (mmdata->Bloc) {
      ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
    }
  }
  mmdata->reusesym = PETSC_FALSE; /* subsequent numeric calls must refresh the temporaries */
  for (i = 0, n = 0; i < mmdata->cp; i++) {
    Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
    const PetscScalar *vv;

    if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
    ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
    if (mmdata->mptmp[i]) continue; /* temporary products feed later products, not C */
    /* TODO: add support for using GPU data directly */
    ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
    n   += mm->nz;
  }
  if (mmdata->sf) { /* offprocess insertion */
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
  }
  ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pt * A or A * P */
#define MAX_NUMBER_INTERMEDIATE 4
/* Symbolic phase: decomposes the requested product (AB, AtB, or PtAP) into up
   to MAX_NUMBER_INTERMEDIATE sequential intermediate products, records for
   each one how its rows/columns map to global indices of C
   (rmapt/cmapt: 0 = copy, 1 = shift by ownership start, 2 = indirect through
   rmapa/cmapa), then builds the COO index arrays and preallocates C. */
static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
{
  Mat_Product            *product = C->product;
  Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
  Mat_MPIAIJ             *a,*p;
  MatMatMPIAIJCUSPARSE   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob = NULL;
  const PetscInt         *globidx,*P_oth_idx;
  const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
  PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
  MatProductType         ptype;
  PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
  PetscMPIInt            size;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  ptype = product->type;
  /* for symmetric A, A^t B is computed as A B */
  if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  /* determine A, P and the local/global sizes of C; AtB and PtAP produce rows
     owned according to P's column layout, hence may need off-process insertion */
  switch (ptype) {
  case MATPRODUCT_AB:
    A = product->A;
    P = product->B;
    m = A->rmap->n;
    n = P->cmap->n;
    M = A->rmap->N;
    N = P->cmap->N;
    break;
  case MATPRODUCT_AtB:
    P = product->A;
    A = product->B;
    m = P->cmap->n;
    n = A->cmap->n;
    M = P->cmap->N;
    N = A->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  case MATPRODUCT_PtAP:
    A = product->A;
    P = product->B;
    m = P->cmap->n;
    n = P->cmap->n;
    M = P->cmap->N;
    N = P->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
  if (size == 1) hasoffproc = PETSC_FALSE; /* everything is local on one rank */

  /* defaults */
  for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = 0;
    cmapt[i] = 0;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  a = (Mat_MPIAIJ*)A->data;
  p = (Mat_MPIAIJ*)P->data;
  ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->reusesym = product->api_user;
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);

    if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
      /* P is product->B */
      ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
      ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
      rmapt[cp] = 1;       /* rows are owned rows of C */
      cmapt[cp] = 2;       /* columns mapped through the merged column index set */
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
      ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 1;       /* columns are owned columns of P */
      mptmp[cp] = PETSC_FALSE;
      cp++;
      ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray; /* ghost columns of P */
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx; /* columns of the compacted P_oth, in global numbering */
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 1;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    /* contribution from the ghost rows of P: rows map through p->garray */
    ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = p->garray;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    break;
  case MATPRODUCT_PtAP:
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
    /* P is product->B */
    ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
    ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
    ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
    ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
    mp[cp]->product->api_user = product->api_user;
    ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
    if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
    ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
    ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
      ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
      /* temporary product A_off * P_oth, consumed by the next product below */
      ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      mptmp[cp] = PETSC_TRUE; /* values not copied into C directly */
      cp++;
      /* Bloc^t * (A_off * P_oth) — uses mp[1] created just above */
      ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
      ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
      ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
      mp[cp]->product->api_user = product->api_user;
      ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
      if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
      ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
  }
  /* persist the intermediate products and their temporary flags in the context */
  ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
  ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
  for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
  mmdata->cp             = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
  C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;

  /* prepare coo coordinates for values insertion */
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) { /* note: cp reused as loop index from here on */
    Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
    if (mptmp[cp]) continue;
    ncoo += mm->nz;
  }
  ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
  ncoo = 0;
  for (cp = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm  = (Mat_SeqAIJ*)mp[cp]->data;
    PetscInt       *coi = coo_i + ncoo;
    PetscInt       *coj = coo_j + ncoo;
    const PetscInt mr   = mp[cp]->rmap->n;
    const PetscInt *jj  = mm->j;
    const PetscInt *ii  = mm->i;

    if (mptmp[cp]) continue;
    /* rows coo */
    if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
    else if (rmapt[cp] == 1) { /* local to global for owned rows of */
      const PetscInt rs = C->rmap->rstart;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    } else { /* offprocess */
      const PetscInt *rmap = rmapa[cp];
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        for (j = ii[i]; j < ii[i+1]; j++) {
          coi[j] = gr;
        }
      }
    }
    /* columns coo */
    if (!cmapt[cp]) {
      ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
    } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
      const PetscInt cs = P->cmap->rstart;
      for (j = 0; j < mm->nz; j++) {
        coj[j] = jj[j] + cs;
      }
    } else { /* offdiag */
      const PetscInt *cmap = cmapa[cp];
      for (j = 0; j < mm->nz; j++) {
        coj[j] = cmap[jj[j]];
      }
    }
    ncoo += mm->nz;
  }
  if (glob) {
    ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
  }
  ierr = ISDestroy(&glob);CHKERRQ(ierr);
  if (P_oth_l2g) {
    ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);

  if (hasoffproc) { /* offproc values insertion */
    const PetscInt *sfdeg;
    const PetscInt n = P->cmap->n;
    PetscInt       ncoo2,*coo_i2,*coo_j2;

    /* rows of C are laid out like columns of P, so the SF over P->cmap routes
       each locally computed COO entry to the rank owning its row */
    ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
    for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i]; /* # entries landing here */
    ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
    ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
    ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
    /* coo_w stages the local (pre-gather) values; sized with the old ncoo */
    ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
    coo_i = coo_i2;
    coo_j = coo_j2;
    ncoo  = ncoo2;
  }

  /* preallocate with COO data */
  ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
  ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
  ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Selects the CUSPARSE product implementation above for AB/AtB/PtAP when B is
   MPIAIJCUSPARSE and not bound to the CPU; otherwise falls back to the generic
   MPIAIJ product selection. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Biscusp = PETSC_FALSE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (Biscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_PtAP:
      mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
702 break; 703 default: 704 break; 705 } 706 } 707 if (!mat->ops->productsymbolic) { 708 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 709 } 710 PetscFunctionReturn(0); 711 } 712 713 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 714 { 715 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 716 PetscErrorCode ierr; 717 PetscInt nt; 718 719 PetscFunctionBegin; 720 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 721 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 722 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 723 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 724 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 725 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 726 PetscFunctionReturn(0); 727 } 728 729 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 730 { 731 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 732 PetscErrorCode ierr; 733 734 PetscFunctionBegin; 735 if (A->factortype == MAT_FACTOR_NONE) { 736 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 737 PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 738 if (d_mat) { 739 Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 740 Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 741 PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 742 cudaError_t err; 743 PetscScalar *vals; 744 ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 745 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 746 err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 747 err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 748 err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 749 } 750 } 751 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 752 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 753 
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx: same scatter/compute overlap as MatMult, with the add variant on both blocks */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);      /* diagonal block while scatter is in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* accumulate off-diagonal contribution */
  PetscFunctionReturn(0);
}

/* y = A^T*x: off-diagonal transpose result is scattered back (reverse, additive) into yy */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diagonal and/or off-diagonal block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
cusparseStruct->diagGPUMatFormat = format; 803 cusparseStruct->offdiagGPUMatFormat = format; 804 break; 805 default: 806 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 807 } 808 PetscFunctionReturn(0); 809 } 810 811 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 812 { 813 MatCUSPARSEStorageFormat format; 814 PetscErrorCode ierr; 815 PetscBool flg; 816 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 817 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 818 819 PetscFunctionBegin; 820 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 821 if (A->factortype==MAT_FACTOR_NONE) { 822 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 823 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 824 if (flg) { 825 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 826 } 827 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 828 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 829 if (flg) { 830 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 831 } 832 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 833 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 834 if (flg) { 835 ierr = 
MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After the usual MPIAIJ assembly: make lvec a CUDA vector on first assembly, and mark the
   matrix as offloaded to the GPU if a split device matrix exists (values were written there). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/* Release all GPU resources (device split matrix, cuSPARSE handles, stream, COO work arrays),
   remove the composed methods, then fall through to the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host to learn which device arrays this file allocated */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-offset arrays were cudaMalloc'ed here only when the host block uses compressed rows */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat =
cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place or into a duplicate/copy) to MPIAIJCUSPARSE:
   switch the blocks and lvec to their CUDA types, create the cuSPARSE bookkeeping
   structure when needed, and install the CUSPARSE method table. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* MAT_INPLACE_MATRIX: A aliases B */

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct =
(Mat_MPIAIJCUSPARSE*)a->spptr;
    /* defaults: CSR storage for both blocks, no COO permutation data, no private stream */
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p  = NULL;
    cusparseStruct->coo_pw = NULL;
    cusparseStruct->stream = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain MPIAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves.
   We search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* pick the sequential flavor on a single-rank communicator, MPI flavor otherwise */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
/* Build (on first call) and return the device-resident PetscSplitCSRDataStructure for A,
   so that matrix values can be written from device kernels. Marks the matrix as unassembled
   on return (the caller is expected to set values and re-assemble). Returns NULL in *B for
   factored matrices. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;  /* location of the cached device pointer (seq or mpi spptr) */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;  /* device CSR row-offset/column-index pointers */
  PetscScalar                *aa,*ba;          /* device CSR value pointers */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diagonal" block exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* MPI: grab device CSR pointers of both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;  /* factored matrices are not supported: return NULL, not an error */
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;  /* h_mat is staged on the host, then copied down */
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      /* NOTE(review): h_mat.colmap (and offdiag.n/ignorezeroentries) are left unassigned in this
         branch, so stack garbage is copied to the device below — confirm no consumer (or the
         seq destroy path) ever reads/frees colmap for the sequential case. */
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel: catches accidental overwrite of the last slot below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap maps global column -> (local off-diag column + 1); 0 means "not present" */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* compressed-row host block: stage an uncompressed row-offset copy on the device */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi; /* otherwise reuse the cuSPARSE block's own device row offsets */
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; /* already built on a previous call: just hand it back */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}