#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
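/*
   Usage sketch (illustrative; the sizes, indices, and values are hypothetical):
   user code reaches the COO method above through the public MatSetPreallocationCOO()
   and MatSetValuesCOO() wrappers, which dispatch to the "MatSetPreallocationCOO_C"
   and "MatSetValuesCOO_C" functions composed on the matrix in MatConvert below:

     PetscInt    coo_i[3] = {0,0,1};                  // global row indices
     PetscInt    coo_j[3] = {0,2,1};                  // global column indices
     PetscScalar v[3]     = {1.0,2.0,3.0};

     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr); // pattern, once
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);      // values, repeatable
*/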
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
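/*
   Worked example for the functors above: with a column ownership range
   [cstart,cend) = [4,8), IsOffDiag flags global columns 2 and 9 but not 5
   and 6, so COO columns {2,5,9,6} are partitioned into a diagonal-block
   part {5,6} and an off-diagonal part {2,9}. thrust::partition does not
   preserve the original entry order, which is why the preallocation routine
   below records the permutation (coo_p) and MatSetValuesCOO above permutes
   the user-provided values through it.
*/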
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
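/*
   Example of the mapping built above (hypothetical numbers): if the sorted,
   uniquified off-diagonal global columns are garray = {3,7,12}, then b->B is
   created with 3 compacted columns, and an off-diagonal COO entry with global
   column 7 is stored in b->B at local column 1 via the hash-based
   global-to-local mapping.
*/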
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
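/*
   Usage sketch for the method above, reached through the public
   MatMPIAIJGetLocalMatMerge() interface (error checking elided):

     Mat A_loc;
     IS  glob;

     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr);
     // A_loc is a MATSEQAIJCUSPARSE matrix holding [Ad Ao]; glob maps its
     // columns back to global column indices
     ierr = ISDestroy(&glob);CHKERRQ(ierr);
     ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/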
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
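/*
   Usage sketch: request an explicitly stored transpose before repeated
   MatMultTranspose() calls, so those calls can use the stored transpose:

     ierr = MatAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);
*/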
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
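/*
   Note on the products above: in MatMult and MatMultAdd the diagonal-block
   product is launched between VecScatterBegin() and VecScatterEnd(), so the
   communication of ghost values overlaps with local computation.
   MatMultTranspose reverses the pattern, accumulating the off-diagonal
   contributions back into yy with an ADD_VALUES reverse scatter.
*/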
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
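/*
   Command-line sketch for the options registered above (values shown are
   examples; csr is the default):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format csr \
           -mat_cusparse_mult_offdiag_storage_format hyb
*/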
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
#if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
#endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p  = NULL;
    cusparseStruct->coo_pw = NULL;
    cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
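/*
   Usage sketch: an existing assembled MATMPIAIJ matrix can be moved to the
   GPU with

     ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);

   or the type can be selected up front with MatSetType(A,MATMPIAIJCUSPARSE)
   (or -mat_type mpiaijcusparse) before preallocation and assembly.
*/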
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
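/*
   Usage sketch for MatCreateAIJCUSPARSE() above (sizes are illustrative):
   a 100x100 matrix with at most 5 nonzeros per row in the diagonal block
   and at most 2 per row in the off-diagonal block:

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/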
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
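/*
   Usage sketch for MatCUSPARSEGetDeviceMatWrite() below (hedged: the kernel
   is hypothetical, and MatSetValuesDevice() is assumed to be the device-side
   setter declared in petscaijdevice.h):

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     // launch a user assembly kernel in which each thread calls something like
     //   MatSetValuesDevice(d_mat,1,&row,1,&col,&val,ADD_VALUES);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/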
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssemble, which we no longer call; it is also done in the AIJ AssemblyEnd -- still needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void**)&d_mat,sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset(d_mat,0,sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i,(n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i,jacb->i,(n+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap,(A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap,aij->colmap,(A->cmap->N+1)*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i,(n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i,jaca->i,(n+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err  = cudaMemcpy(d_mat,&h_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n,nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues; this is done lazily in the normal MatSetValues path
  PetscFunctionReturn(0);
#endif
}