#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: assign the first tuple element to the second */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* a permutation was computed at preallocation time: gather v into the
       diagonal-first/off-diagonal-last ordering stored in coo_pw */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
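
/*
   The functors below implement the COO preallocation phase. A small worked
   example (illustrative only): with column ownership [cstart,cend) = [2,4)
   and input columns j = {0,2,5,3}, IsOffDiag flags columns {0,5}, so
   thrust::partition reorders the entries with diagonal-block columns first
   (one possible result is j = {2,3,0,5}; the partition is not stable).
   The permutation coo_p records the original positions so that
   MatSetValuesCOO_MPIAIJCUSPARSE() above can gather user-provided values
   into the same order.
*/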

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal entries first and off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diagonal and off-diagonal entries are interleaved: partition the (i,j)
       pairs together with a 0..n-1 sequence so coo_p records the permutation */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store the compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort and unique the offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* returns the end iterator of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
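
/*
   Typical use of the two COO routines above through the type-independent
   interface (a minimal sketch; m, n, ncoo, coo_i, coo_j, and v are
   application-provided):

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr); // dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE()
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);         // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE()
*/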

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
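
/*
   Sketch of how the merge routine above is typically reached (hedged; A is an
   assembled MATMPIAIJCUSPARSE matrix):

     Mat A_loc;
     IS  glob;
     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr);
     // A_loc holds the local rows [Ad Ao] as one SEQAIJCUSPARSE matrix;
     // glob maps its local column numbering back to global columns
     ierr = ISDestroy(&glob);CHKERRQ(ierr);
     ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/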

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
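
/*
   The MatMult/MatMultAdd/MatMultTranspose kernels below all use the standard
   MPIAIJ decomposition

     y = A*x = A_d*x_local + A_o*x_ghost

   where A_d (a->A) holds the diagonal block and A_o (a->B) the off-diagonal
   block in compacted column numbering. The scatter that fills the ghost
   vector a->lvec is started before the local SpMV and completed after it,
   so communication overlaps with the diagonal-block product on the GPU.
*/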

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
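
/*
   Two equivalent ways of selecting the storage formats handled above (a
   sketch; ELL/HYB availability depends on the cuSPARSE version):

     on the command line, for codes that call MatSetFromOptions():
       -mat_type mpiaijcusparse -mat_cusparse_storage_format ell

     or programmatically:
       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/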

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
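
/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE() below is reached through
   MatCreate_MPIAIJCUSPARSE() (with MAT_INPLACE_MATRIX) when a user sets the
   type directly, or through an explicit conversion of an existing MPIAIJ
   matrix (sketch):

     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     // or
     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
*/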

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
That is, the stored row and column indices can begin at 573 either one (as in Fortran) or zero. See the users' manual for details. 574 575 Specify the preallocated storage with either nz or nnz (not both). 576 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 577 allocation. For large problems you MUST preallocate memory or you 578 will get TERRIBLE performance, see the users' manual chapter on matrices. 579 580 By default, this format uses inodes (identical nodes) when possible, to 581 improve numerical efficiency of matrix-vector products and solves. We 582 search for consecutive rows with the same nonzero structure, thereby 583 reusing matrix information to achieve increased efficiency. 584 585 Level: intermediate 586 587 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 588 @*/ 589 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 590 { 591 PetscErrorCode ierr; 592 PetscMPIInt size; 593 594 PetscFunctionBegin; 595 ierr = MatCreate(comm,A);CHKERRQ(ierr); 596 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 597 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 598 if (size > 1) { 599 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 600 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 601 } else { 602 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 603 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 604 } 605 PetscFunctionReturn(0); 606 } 607 608 /*MC 609 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 610 611 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 612 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 613 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 614 615 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 616 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 617 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 618 for communicators controlling multiple processes. It is recommended that you call both of 619 the above preallocation routines for simplicity. 620 621 Options Database Keys: 622 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 623 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 624 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 625 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 626 627 Level: beginner 628 629 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 630 M*/ 631 632 /*MC 633 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/
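
/*
   MatCUSPARSEGetDeviceMatWrite() below exposes a PetscSplitCSRDataStructure:
   a flat, device-resident view of the diagonal (diag) and off-diagonal
   (offdiag) CSR blocks plus a colmap for translating global column indices,
   where colmap[j] holds 1 plus the compacted off-diagonal column for global
   column j, or 0 if column j is absent. A device-side consumer might read it
   as in this hedged sketch (peek_row is hypothetical):

     __device__ void peek_row(PetscSplitCSRDataStructure d_mat, PetscInt r)
     {
       // number of stored entries in local row r of the diagonal block
       int nz = d_mat->diag.i[r+1] - d_mat->diag.i[r];
       (void)nz;
     }
*/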

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);

// Get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to the device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}