#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: copy element 0 of a zip-tuple into element 1.  Used with a
   permutation iterator to gather user-provided COO values into the internal
   (diagonal-first) ordering. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert the numerical values v[] (in the user's original COO ordering set up by
   MatSetPreallocationCOO_MPIAIJCUSPARSE) into the diagonal (a->A) and off-diagonal
   (a->B) sequential blocks.  If a permutation coo_p was recorded during
   preallocation, v is first permuted on the device into the workspace coo_pw;
   otherwise v is already split as [diagonal entries | off-diagonal entries].
   v may be a host or device pointer (detected via isCudaMem), or NULL. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries = diag + offdiag counts */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather: coo_pw[k] = v[coo_p[k]] (permute values into diagonal-first order) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed: v is already [diag | offdiag] (or NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the parallel matrix assembled; data now lives on the GPU */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on (row,col,perm) zip-tuples: true when the column (tuple element 1)
   lies inside [_cstart,_cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a global column index: true when it falls OUTSIDE the local
   column ownership range [_cstart,_cend) (off-diagonal block entry). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index to a local one by subtracting the ownership start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Set up COO assembly for an MPIAIJCUSPARSE matrix from n (row,col) pairs in
   global numbering.  Partitions the entries into diagonal-block-first order on
   the device (recording the permutation in cusp->coo_p when one is needed),
   converts indices to local numbering, builds the compacted off-diagonal
   column map b->garray, and (re)creates the two sequential CUSPARSE blocks
   b->A and b->B with their own COO preallocation. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;                  /* number of distinct off-diagonal (global) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous block matrices: preallocation starts from scratch */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* detect whether the input is already split [diag | offdiag]; if an
     off-diagonal entry is followed by another diagonal entry we must partition */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* stable-order-agnostic partition of (i,j,perm) triples so diagonal-block
       entries come first; coo_p records where each entry came from */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray = sorted distinct global columns of the off-diagonal block */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact jj[] from global column ids to positions in garray via a hash map;
     note n is reused here as the output count */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the diagonal (b->A) and off-diagonal (b->B) sequential blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* values are not set yet: matrix is preallocated but unassembled */
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into a single local SeqAIJCUSPARSE
   matrix (A_loc).  Optionally returns, in glob, the global column indices of
   the merged matrix: owned columns first, then the off-process columns cmap[]. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns, in global numbering */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost columns */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (nnz-based) preallocation for MPIAIJCUSPARSE: validates the nnz
   arrays in debug builds, resets the communication structures, (re)creates the
   sequential CUSPARSE blocks b->A/b->B, and preallocates them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* uniprocessor runs have no off-diagonal block: give it zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx.  Overlaps the ghost-value scatter with the diagonal-block product:
   the off-diagonal contribution is added after the scatter completes. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks of the parallel matrix. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same scatter/compute overlap as MatMult above. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx.  The off-diagonal transpose product goes into lvec and is
   scattered back (ADD_VALUES, reverse) onto the owning processes. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal block; the formats are applied when the blocks are (re)built. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* NOTE(review): the message repeats MULT_DIAG; the second one presumably
       means MAT_CUSPARSE_MULT_OFFDIAG — confirm before changing user-visible text */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the GPU storage formats.
   Only meaningful for unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Standard MPIAIJ assembly, then: move the local work vector to CUDA, and if
   the nonzero pattern changed, invalidate the cached device-side split-CSR
   struct (its pointers reference the old pattern's storage). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* copy the device struct back so we can free the device colmap it owns */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destroy the CUSPARSE-specific pieces (device split-CSR struct, cusparse
   handle, COO permutation arrays), unregister the composed methods, then fall
   through to the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to free the device-resident colmap first */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: switch the sub-matrices and work
   vector to their CUDA types, allocate the Mat_MPIAIJCUSPARSE context (with a
   fresh cusparse handle) when needed, and install the CUSPARSE method table. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is B itself */
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATMPIAIJCUSPARSE: build an MPIAIJ and convert in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If the *_nnz arrays are given then the *_nz values are ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.
568 569 Specify the preallocated storage with either nz or nnz (not both). 570 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 571 allocation. For large problems you MUST preallocate memory or you 572 will get TERRIBLE performance, see the users' manual chapter on matrices. 573 574 By default, this format uses inodes (identical nodes) when possible, to 575 improve numerical efficiency of matrix-vector products and solves. We 576 search for consecutive rows with the same nonzero structure, thereby 577 reusing matrix information to achieve increased efficiency. 578 579 Level: intermediate 580 581 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 582 @*/ 583 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 584 { 585 PetscErrorCode ierr; 586 PetscMPIInt size; 587 588 PetscFunctionBegin; 589 ierr = MatCreate(comm,A);CHKERRQ(ierr); 590 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 591 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 592 if (size > 1) { 593 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 594 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 595 } else { 596 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 597 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 598 } 599 PetscFunctionReturn(0); 600 } 601 602 /*MC 603 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 604 605 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 606 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 607 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  /* Return (in *B) a device pointer to a stripped-down split-CSR view of A,
     suitable for device-side assembly.  Works for SeqAIJCUSPARSE and
     MPIAIJCUSPARSE matrices; the view is created on first call and cached in
     the matrix's spptr (deviceMat).  For factored matrices *B is set to NULL. */
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* device CSR row/col arrays of diag (a*) and offdiag (b*) blocks */
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    PetscBool isseqaij;

    /* uniprocessor: A is either a plain SeqAIJ(CUSPARSE) or a 1-rank MPIAIJ */
    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      /* note: deviceMat is read before CopyToGPU — presumably CopyToGPU does
         not invalidate a previously created deviceMat; confirm if changed */
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* the GPU CsrMatrix stores only nonempty rows; build and cache the
           full (uncompressed) row-offset array on the device */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same uncompression for the diagonal block */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy on device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap[global col] = 1-based position in garray (0 = not a ghost col) */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    /* NOTE(review): d_mat created here is not stored back into spptr->deviceMat
       by this routine — presumably the caller/assembly path caches it; confirm */
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}