#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: copy the first tuple element into the second (used to gather
   permuted COO values into the permanent work array) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert/add the values v[] (in the caller's original COO ordering) into the diagonal (a->A)
   and off-diagonal (a->B) sequential blocks. If a permutation coo_p was built during
   preallocation (diagonal entries first), v is permuted on the device into coo_pw before the
   two Seq calls; v may live on host or device (detected with isCudaMem). v == NULL is passed
   through to the Seq implementations. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* stage host values on the device before permuting */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) and copy element-wise: coo_pw[i] = v[coo_p[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* staged copy no longer needed; coo_pw now holds the permuted values */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* values already arrived partitioned (no permutation was needed) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  PetscFunctionReturn(0);
}

/* Predicate on (row,col,perm) tuples: true when the column lies inside [cstart,cend)
   (i.e. the entry belongs to the diagonal block) */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside the local column ownership range */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local ownership start (global -> local) */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Set up B from n COO triples (coo_i, coo_j, value-later). Partitions the entries on the
   device into diagonal-block-first order (recording the permutation in cusp->coo_p when
   the input is not already partitioned), converts indices from global to local, builds the
   compacted off-diagonal column map b->garray, recreates the Seq blocks b->A/b->B, and
   forwards the split COO data to the Seq preallocation routines. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;          /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);            /* on device, partitioned coo_i: diagonal first, off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous block structure; it will be rebuilt from scratch */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry and, after it, the first diagonal entry; if both
     exist the input is interleaved and must be partitioned (remembering the permutation) */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* #diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* #off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr); /* garray: sorted distinct global off-diag columns */
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact jj[] in place: global off-diag columns -> indices into garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into a single sequential matrix A_loc.
   When glob is requested, also return the global column indices of the merged matrix
   (owned columns first, then the off-diagonal column map). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns are contiguous from cst */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* then the off-diagonal global columns */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Standard MPIAIJ preallocation, creating/refreshing the Seq blocks as MATSEQAIJCUSPARSE
   and propagating the stored cuSPARSE formats and handle to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* validate nnz arrays only in debug builds */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* sequential runs have an empty off-diagonal block */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx, overlapping the ghost-value scatter with the diagonal-block multiply */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);           /* diag block while ghosts are in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); /* accumulate off-diag contribution */
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same scatter/compute overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx: local transposes first, then a reverse-mode additive scatter of lvec */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diagonal and/or off-diagonal block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message fixed: it used to list MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the storage formats of the two blocks */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MPIAIJ assembly plus CUDA-specific fixups: make lvec a CUDA vector and drop any cached
   device-side split-CSR structure when the nonzero pattern changed. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* the struct lives on the device; copy it back to read the colmap pointer to free */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release all cuSPARSE-specific state (device matrix, handle, COO buffers), remove the
   composed methods, then fall through to the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the blocks and lvec to CUDA,
   allocate the cuSPARSE side-structure if needed, and install the CUDA operations
   and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Type constructor: build an MPIAIJ matrix then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* single-process communicators get the sequential type directly */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/* Return in *B a device-resident PetscSplitCSRDataStructure describing A's CSR data
   (diagonal block, and for parallel matrices the off-diagonal block plus a global->local
   column map) so that device kernels can write matrix values directly.
   The structure is created lazily and cached in the (Seq or MPI) cuSPARSE side-structure;
   factored matrices return *B = NULL.  Requires an already assembled matrix stored in
   MAT_CUSPARSE_CSR format.  Marks A's offload mask as GPU on return. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* device CSR row/col arrays */
  PetscScalar                *aa = NULL,*ba = NULL;                       /* device CSR value arrays */
  Mat_SeqAIJ                 *jaca = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices are not supported; signal by returning NULL */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    PetscBool isseqaij;

    /* on one rank A may be a genuine SeqAIJ or an MPIAIJ on a self communicator */
    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device CSR data is current before taking pointers */
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* device CSR stores compressed rows; upload the full (uncompressed) row offsets
           from jacb->i so kernels can index by local row — built once and cached */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  /* same full-row-offsets treatment for the diagonal block */
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy on device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap[g] = local off-diag column index + 1 for global column g; 0 (from calloc)
         means "column g not present in the off-diagonal block".
         NOTE(review): this is O(N_global) host+device memory per rank — a dense map. */
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    /* NOTE(review): the freshly created d_mat is not stored back into the owning
       spptr->deviceMat here — presumably the caller or a companion routine caches it;
       verify against the Seq/MPI Get/Restore device-mat call sites. */
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller is expected to write values on the device */
  PetscFunctionReturn(0);
}