1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 3 4 #include <petscconf.h> 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 #include <thrust/advance.h> 9 #include <thrust/partition.h> 10 #include <thrust/sort.h> 11 #include <thrust/unique.h> 12 #include <petscsf.h> 13 14 struct VecCUDAEquals 15 { 16 template <typename Tuple> 17 __host__ __device__ 18 void operator()(Tuple t) 19 { 20 thrust::get<1>(t) = thrust::get<0>(t); 21 } 22 }; 23 24 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 25 { 26 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 27 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 28 PetscInt n = cusp->coo_nd + cusp->coo_no; 29 PetscErrorCode ierr; 30 31 PetscFunctionBegin; 32 if (cusp->coo_p && v) { 33 thrust::device_ptr<const PetscScalar> d_v; 34 THRUSTARRAY *w = NULL; 35 36 if (isCudaMem(v)) { 37 d_v = thrust::device_pointer_cast(v); 38 } else { 39 w = new THRUSTARRAY(n); 40 w->assign(v,v+n); 41 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 42 d_v = w->data(); 43 } 44 45 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 46 cusp->coo_pw->begin())); 47 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 48 cusp->coo_pw->end())); 49 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 50 thrust::for_each(zibit,zieit,VecCUDAEquals()); 51 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 52 delete w; 53 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 54 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 55 } else { 56 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr); 57 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 58 } 59 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 60 A->num_ass++; 61 A->assembled = PETSC_TRUE; 62 A->ass_nonzerostate = A->nonzerostate; 63 A->offloadmask = PETSC_OFFLOAD_GPU; 64 PetscFunctionReturn(0); 65 } 66 67 template <typename Tuple> 68 struct IsNotOffDiagT 69 { 70 PetscInt _cstart,_cend; 71 72 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 73 __host__ __device__ 74 inline bool operator()(Tuple t) 75 { 76 return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 77 } 78 }; 79 80 struct IsOffDiag 81 { 82 PetscInt _cstart,_cend; 83 84 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 85 __host__ __device__ 86 inline bool operator() (const PetscInt &c) 87 { 88 return c < _cstart || c >= _cend; 89 } 90 }; 91 92 struct GlobToLoc 93 { 94 PetscInt _start; 95 96 GlobToLoc(PetscInt start) : _start(start) {} 97 __host__ __device__ 98 inline PetscInt operator() (const PetscInt &c) 99 { 100 return c - _start; 101 } 102 }; 103 104 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 105 { 106 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 107 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 108 PetscErrorCode ierr; 109 PetscInt *jj; 110 size_t noff = 0; 111 THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 112 THRUSTINTARRAY d_j(n); 113 ISLocalToGlobalMapping l2g; 114 cudaError_t cerr; 115 116 PetscFunctionBegin; 117 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 118 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 119 if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 120 if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 121 ierr = PetscFree(b->garray);CHKERRQ(ierr); 122 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 123 ierr = MatDestroy(&b->A);CHKERRQ(ierr); 124 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 125 126 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 127 d_i.assign(coo_i,coo_i+n); 128 d_j.assign(coo_j,coo_j+n); 129 delete cusp->coo_p; 130 delete cusp->coo_pw; 131 cusp->coo_p = NULL; 132 cusp->coo_pw = NULL; 133 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 134 auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 135 auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 136 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 137 cusp->coo_p = new THRUSTINTARRAY(n); 138 cusp->coo_pw = new THRUSTARRAY(n); 139 thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 140 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 141 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 142 auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 143 firstoffd = mzipp.get_iterator_tuple().get<1>(); 144 } 145 cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 146 cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 147 148 /* from global to local */ 149 thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 150 thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 151 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 152 153 /* copy offdiag column indices to map on the CPU */ 154 ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */ 155 cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 156 auto o_j = d_j.begin(); 157 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 158 thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */ 159 thrust::sort(thrust::device,o_j,d_j.end()); 160 auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */ 161 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 162 noff = thrust::distance(o_j,wit); 163 ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr); 164 cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 165 ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 166 ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 167 ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 168 ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr); 169 if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",n,cusp->coo_no); 170 ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 171 172 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 173 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 174 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 175 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 176 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 177 ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 178 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 179 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 180 181 /* GPU memory, cusparse specific call handles it internally */ 182 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 183 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 184 ierr = PetscFree(jj);CHKERRQ(ierr); 185 186 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 187 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 188 ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 189 ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 190 /* 191 ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 192 ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 193 */ 194 ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 195 B->preallocated = PETSC_TRUE; 196 B->nonzerostate++; 197 198 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 199 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 200 B->offloadmask = PETSC_OFFLOAD_CPU; 201 B->assembled = PETSC_FALSE; 202 B->was_assembled = PETSC_FALSE; 203 PetscFunctionReturn(0); 204 } 205 206 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 207 { 208 Mat Ad,Ao; 209 const PetscInt *cmap; 210 PetscErrorCode ierr; 211 212 PetscFunctionBegin; 213 ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 214 ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 215 if (glob) { 216 PetscInt cst, i, dn, on, *gidx; 217 218 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 219 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 220 ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 221 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 222 for (i=0; i<dn; i++) gidx[i] = cst + i; 223 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 224 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 225 } 226 PetscFunctionReturn(0); 227 } 228 229 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 230 { 231 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 232 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 233 PetscErrorCode ierr; 234 PetscInt i; 235 236 PetscFunctionBegin; 237 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 238 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 239 if (PetscDefined(USE_DEBUG) && d_nnz) { 240 for (i=0; i<B->rmap->n; i++) { 241 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 242 } 243 } 244 if (PetscDefined(USE_DEBUG) && o_nnz) { 245 for (i=0; i<B->rmap->n; i++) { 246 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 247 } 248 } 249 #if defined(PETSC_USE_CTABLE) 250 ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr); 251 #else 252 ierr = PetscFree(b->colmap);CHKERRQ(ierr); 253 #endif 254 ierr = PetscFree(b->garray);CHKERRQ(ierr); 255 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 256 ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr); 257 /* Because the B will have been resized we simply destroy it and create a new one each time */ 258 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 259 if (!b->A) { 260 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 261 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 262 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 263 } 264 if (!b->B) { 265 PetscMPIInt size; 266 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr); 267 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 268 ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr); 269 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 270 } 271 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 272 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 273 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 274 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 275 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 276 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 277 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 278 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 279 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 280 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 281 /* Let A, B use b's handle with pre-set stream 282 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 283 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 284 */ 285 B->preallocated = PETSC_TRUE; 286 PetscFunctionReturn(0); 287 } 288 289 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 290 { 291 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 292 PetscErrorCode ierr; 293 PetscInt nt; 294 295 PetscFunctionBegin; 296 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 297 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt); 298 /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */ 299 if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);} 300 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 301 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 302 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 303 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 304 PetscFunctionReturn(0); 305 } 306 307 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 308 { 309 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 310 PetscErrorCode ierr; 311 312 PetscFunctionBegin; 313 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 314 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 315 PetscFunctionReturn(0); 316 } 317 318 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 319 { 320 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 321 PetscErrorCode ierr; 322 PetscInt nt; 323 324 PetscFunctionBegin; 325 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 326 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt); 327 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 328 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 329 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 330 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 335 { 336 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 337 PetscErrorCode ierr; 338 PetscInt nt; 339 340 PetscFunctionBegin; 341 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 342 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt); 343 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 344 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 345 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 346 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 347 PetscFunctionReturn(0); 348 } 349 350 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 351 { 352 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 353 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 354 355 PetscFunctionBegin; 356 switch (op) { 357 case MAT_CUSPARSE_MULT_DIAG: 358 cusparseStruct->diagGPUMatFormat = format; 359 break; 360 case MAT_CUSPARSE_MULT_OFFDIAG: 361 cusparseStruct->offdiagGPUMatFormat = format; 362 break; 363 case MAT_CUSPARSE_ALL: 364 cusparseStruct->diagGPUMatFormat = format; 365 cusparseStruct->offdiagGPUMatFormat = format; 366 break; 367 default: 368 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 369 } 370 PetscFunctionReturn(0); 371 } 372 373 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 374 { 375 MatCUSPARSEStorageFormat format; 376 PetscErrorCode ierr; 377 PetscBool flg; 378 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 379 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 380 381 PetscFunctionBegin; 382 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 383 if (A->factortype==MAT_FACTOR_NONE) { 384 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 385 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 386 if (flg) { 387 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 388 } 389 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 390 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 391 if (flg) { 392 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 393 } 394 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 395 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 396 if (flg) { 397 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 398 } 399 } 400 ierr = PetscOptionsTail();CHKERRQ(ierr); 401 PetscFunctionReturn(0); 402 } 403 404 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 405 { 406 PetscErrorCode ierr; 407 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 408 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 409 PetscObjectState onnz = A->nonzerostate; 410 411 PetscFunctionBegin; 412 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 413 if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); } 414 if (onnz != A->nonzerostate && cusp->deviceMat) { 415 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 416 cudaError_t cerr; 417 418 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 419 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 420 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 421 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 422 if (h_mat->allocated_indices) { 423 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 424 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 425 if (h_mat->offdiag.j) { 426 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 427 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 428 } 429 } 430 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 431 ierr = PetscFree(h_mat);CHKERRQ(ierr); 432 cusp->deviceMat = NULL; 433 } 434 PetscFunctionReturn(0); 435 } 436 437 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 438 { 439 PetscErrorCode ierr; 440 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 441 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 442 cusparseStatus_t stat; 443 444 PetscFunctionBegin; 445 if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 446 if (cusparseStruct->deviceMat) { 447 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 448 cudaError_t cerr; 449 450 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 451 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 452 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 453 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 454 if (h_mat->allocated_indices) { 455 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 456 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 457 if (h_mat->offdiag.j) { 458 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 459 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 460 } 461 } 462 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 463 ierr = PetscFree(h_mat);CHKERRQ(ierr); 464 } 465 try { 466 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 467 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 468 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 469 /* We want cusparseStruct to use PetscDefaultCudaStream 470 if (cusparseStruct->stream) { 471 cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 472 } 473 */ 474 delete cusparseStruct->coo_p; 475 delete cusparseStruct->coo_pw; 476 delete cusparseStruct; 477 } catch(char *ex) { 478 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 479 } 480 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 481 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 482 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 483 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 484 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 485 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 486 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 487 PetscFunctionReturn(0); 488 } 489 490 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 491 { 492 PetscErrorCode ierr; 493 Mat_MPIAIJ *a; 494 Mat_MPIAIJCUSPARSE *cusparseStruct; 495 cusparseStatus_t stat; 496 Mat A; 497 498 PetscFunctionBegin; 499 if (reuse == MAT_INITIAL_MATRIX) { 500 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 501 } else if (reuse == MAT_REUSE_MATRIX) { 502 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 503 } 504 A = *newmat; 505 A->boundtocpu = PETSC_FALSE; 506 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 507 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 508 509 a = (Mat_MPIAIJ*)A->data; 510 if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 511 if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 512 if (a->lvec) { 513 ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 514 } 515 516 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 517 a->spptr = new Mat_MPIAIJCUSPARSE; 518 519 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 520 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 521 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 522 cusparseStruct->coo_p = NULL; 523 cusparseStruct->coo_pw = NULL; 524 cusparseStruct->stream = 0; 525 cusparseStruct->deviceMat = NULL; 526 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 527 } 528 529 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 530 A->ops->mult = MatMult_MPIAIJCUSPARSE; 531 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 532 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 533 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 534 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 535 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 536 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 537 538 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 539 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 540 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 541 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 542 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 543 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 544 #if defined(PETSC_HAVE_HYPRE) 545 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 546 #endif 547 PetscFunctionReturn(0); 548 } 549 550 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 551 { 552 PetscErrorCode ierr; 553 554 PetscFunctionBegin; 555 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 556 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 557 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 558 PetscFunctionReturn(0); 559 } 560 561 /*@ 562 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 563 (the default parallel PETSc format). This matrix will ultimately pushed down 564 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 565 assembly performance the user should preallocate the matrix storage by setting 566 the parameter nz (or the array nnz). By setting these parameters accurately, 567 performance during matrix assembly can be increased by more than a factor of 50. 568 569 Collective 570 571 Input Parameters: 572 + comm - MPI communicator, set to PETSC_COMM_SELF 573 . m - number of rows 574 . n - number of columns 575 . nz - number of nonzeros per row (same for all rows) 576 - nnz - array containing the number of nonzeros in the various rows 577 (possibly different for each row) or NULL 578 579 Output Parameter: 580 . A - the matrix 581 582 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 583 MatXXXXSetPreallocation() paradigm instead of this routine directly. 584 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 585 586 Notes: 587 If nnz is given then nz is ignored 588 589 The AIJ format (also called the Yale sparse matrix format or 590 compressed row storage), is fully compatible with standard Fortran 77 591 storage. That is, the stored row and column indices can begin at 592 either one (as in Fortran) or zero. See the users' manual for details. 593 594 Specify the preallocated storage with either nz or nnz (not both). 595 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 596 allocation. For large problems you MUST preallocate memory or you 597 will get TERRIBLE performance, see the users' manual chapter on matrices. 598 599 By default, this format uses inodes (identical nodes) when possible, to 600 improve numerical efficiency of matrix-vector products and solves. We 601 search for consecutive rows with the same nonzero structure, thereby 602 reusing matrix information to achieve increased efficiency. 603 604 Level: intermediate 605 606 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 607 @*/ 608 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 609 { 610 PetscErrorCode ierr; 611 PetscMPIInt size; 612 613 PetscFunctionBegin; 614 ierr = MatCreate(comm,A);CHKERRQ(ierr); 615 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 616 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 617 if (size > 1) { 618 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 619 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 620 } else { 621 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 622 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 623 } 624 PetscFunctionReturn(0); 625 } 626 627 /*MC 628 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 629 630 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 631 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 632 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 633 634 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 635 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 636 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 637 for communicators controlling multiple processes. It is recommended that you call both of 638 the above preallocation routines for simplicity. 639 640 Options Database Keys: 641 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 642 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 643 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 644 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 645 646 Level: beginner 647 648 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 649 M*/ 650 651 /*MC 652 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 653 654 Level: beginner 655 656 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 657 M*/ 658 659 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 660 661 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 662 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 663 { 664 PetscSplitCSRDataStructure d_mat; 665 PetscMPIInt size; 666 PetscErrorCode ierr; 667 int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 668 PetscScalar *aa = NULL,*ba = NULL; 669 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 670 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 671 CsrMatrix *matrixA = NULL,*matrixB = NULL; 672 673 PetscFunctionBegin; 674 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 675 if (A->factortype != MAT_FACTOR_NONE) { 676 *B = NULL; 677 PetscFunctionReturn(0); 678 } 679 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 680 // get jaca 681 if (size == 1) { 682 PetscBool isseqaij; 683 684 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 685 if (isseqaij) { 686 jaca = (Mat_SeqAIJ*)A->data; 687 if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 688 cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 689 d_mat = cusparsestructA->deviceMat; 690 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 691 } else { 692 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 693 if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 694 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 695 jaca = (Mat_SeqAIJ*)aij->A->data; 696 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 697 d_mat = spptr->deviceMat; 698 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 699 } 700 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 701 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 702 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 703 matrixA = (CsrMatrix*)matstruct->mat; 704 bi = NULL; 705 bj = NULL; 706 ba = NULL; 707 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 708 } else { 709 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 710 if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 711 jaca = (Mat_SeqAIJ*)aij->A->data; 712 jacb = (Mat_SeqAIJ*)aij->B->data; 713 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 714 715 if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 716 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 717 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 718 if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 719 if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 720 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 721 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr); 722 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 723 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 724 if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 725 if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 726 matrixA = (CsrMatrix*)matstructA->mat; 727 matrixB = (CsrMatrix*)matstructB->mat; 728 if (jacb->compressedrow.use) { 729 if (!cusparsestructB->rowoffsets_gpu) { 730 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 731 cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 732 } 733 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 734 } else { 735 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 736 } 737 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 738 ba = thrust::raw_pointer_cast(matrixB->values->data()); 739 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 740 d_mat = spptr->deviceMat; 741 } 742 if (jaca->compressedrow.use) { 743 if (!cusparsestructA->rowoffsets_gpu) { 744 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 745 cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 746 } 747 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 748 } else { 749 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 750 } 751 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 752 aa = thrust::raw_pointer_cast(matrixA->values->data()); 753 754 if (!d_mat) { 755 cudaError_t cerr; 756 PetscSplitCSRDataStructure h_mat; 757 758 // create and populate strucy on host and copy on device 759 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 760 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 761 cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr); 762 if (size > 1) { /* need the colmap array */ 763 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 764 PetscInt *colmap; 765 PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 766 767 if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 768 769 ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr); 770 for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 771 #if defined(PETSC_USE_64BIT_INDICES) 772 { // have to make a long version of these 773 int *h_bi32, *h_bj32; 774 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 775 ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr); 776 cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 777 for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 778 cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 779 for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 780 781 cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr); 782 cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 783 cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr); 784 cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 785 786 h_mat->offdiag.i = d_bi64; 787 h_mat->offdiag.j = d_bj64; 788 h_mat->allocated_indices = PETSC_TRUE; 789 790 ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr); 791 } 792 #else 793 h_mat->offdiag.i = (PetscInt*)bi; 794 h_mat->offdiag.j = (PetscInt*)bj; 795 h_mat->allocated_indices = PETSC_FALSE; 796 #endif 797 h_mat->offdiag.a = ba; 798 h_mat->offdiag.n = A->rmap->n; 799 800 cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr); 801 cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 802 ierr = PetscFree(colmap);CHKERRQ(ierr); 803 } 804 h_mat->rstart = A->rmap->rstart; 805 h_mat->rend = A->rmap->rend; 806 h_mat->cstart = A->cmap->rstart; 807 h_mat->cend = A->cmap->rend; 808 h_mat->M = A->cmap->N; 809 #if defined(PETSC_USE_64BIT_INDICES) 810 { 811 if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 812 int *h_ai32, *h_aj32; 813 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 814 ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr); 815 cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 816 for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 817 cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 818 for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 819 820 cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr); 821 cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 822 cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr); 823 cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 824 825 h_mat->diag.i = d_ai64; 826 h_mat->diag.j = d_aj64; 827 h_mat->allocated_indices = PETSC_TRUE; 828 829 ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr); 830 } 831 #else 832 h_mat->diag.i = (PetscInt*)ai; 833 h_mat->diag.j = (PetscInt*)aj; 834 h_mat->allocated_indices = PETSC_FALSE; 835 #endif 836 h_mat->diag.a = aa; 837 h_mat->diag.n = A->rmap->n; 838 h_mat->rank = PetscGlobalRank; 839 // copy pointers and metadata to device 840 cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 841 ierr = PetscFree(h_mat);CHKERRQ(ierr); 842 } else { 843 ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr); 844 } 845 *B = d_mat; 846 A->offloadmask = PETSC_OFFLOAD_GPU; 847 PetscFunctionReturn(0); 848 } 849