1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 3 4 #include <petscconf.h> 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 #include <thrust/advance.h> 9 #include <petscsf.h> 10 11 struct VecCUDAEquals 12 { 13 template <typename Tuple> 14 __host__ __device__ 15 void operator()(Tuple t) 16 { 17 thrust::get<1>(t) = thrust::get<0>(t); 18 } 19 }; 20 21 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 22 { 23 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 24 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 25 PetscInt n = cusp->coo_nd + cusp->coo_no; 26 PetscErrorCode ierr; 27 28 PetscFunctionBegin; 29 if (cusp->coo_p && v) { 30 thrust::device_ptr<const PetscScalar> d_v; 31 THRUSTARRAY *w = NULL; 32 33 if (isCudaMem(v)) { 34 d_v = thrust::device_pointer_cast(v); 35 } else { 36 w = new THRUSTARRAY(n); 37 w->assign(v,v+n); 38 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 39 d_v = w->data(); 40 } 41 42 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 43 cusp->coo_pw->begin())); 44 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 45 cusp->coo_pw->end())); 46 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 47 thrust::for_each(zibit,zieit,VecCUDAEquals()); 48 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 49 delete w; 50 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 51 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 52 } else { 53 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr); 54 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 55 } 56 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 57 A->num_ass++; 58 A->assembled = PETSC_TRUE; 59 A->ass_nonzerostate = A->nonzerostate; 60 A->offloadmask = PETSC_OFFLOAD_GPU; 61 PetscFunctionReturn(0); 62 } 63 64 template <typename Tuple> 65 struct IsNotOffDiagT 66 { 67 PetscInt _cstart,_cend; 68 69 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 70 __host__ __device__ 71 inline bool operator()(Tuple t) 72 { 73 return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 74 } 75 }; 76 77 struct IsOffDiag 78 { 79 PetscInt _cstart,_cend; 80 81 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 82 __host__ __device__ 83 inline bool operator() (const PetscInt &c) 84 { 85 return c < _cstart || c >= _cend; 86 } 87 }; 88 89 struct GlobToLoc 90 { 91 PetscInt _start; 92 93 GlobToLoc(PetscInt start) : _start(start) {} 94 __host__ __device__ 95 inline PetscInt operator() (const PetscInt &c) 96 { 97 return c - _start; 98 } 99 }; 100 101 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 102 { 103 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 104 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 105 PetscErrorCode ierr; 106 PetscInt *jj; 107 size_t noff = 0; 108 THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 109 THRUSTINTARRAY d_j(n); 110 ISLocalToGlobalMapping l2g; 111 cudaError_t cerr; 112 113 PetscFunctionBegin; 114 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 115 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 116 if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 117 if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 118 ierr = PetscFree(b->garray);CHKERRQ(ierr); 119 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 120 ierr = MatDestroy(&b->A);CHKERRQ(ierr); 121 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 122 123 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 124 d_i.assign(coo_i,coo_i+n); 125 d_j.assign(coo_j,coo_j+n); 126 delete cusp->coo_p; 127 delete cusp->coo_pw; 128 cusp->coo_p = NULL; 129 cusp->coo_pw = NULL; 130 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 131 auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 132 auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 133 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 134 cusp->coo_p = new THRUSTINTARRAY(n); 135 cusp->coo_pw = new THRUSTARRAY(n); 136 thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 137 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 138 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 139 auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 140 firstoffd = mzipp.get_iterator_tuple().get<1>(); 141 } 142 cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 143 cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 144 145 /* from global to local */ 146 thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 147 thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 148 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 149 150 /* copy offdiag column indices to map on the CPU */ 151 ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */ 152 cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 153 auto o_j = d_j.begin(); 154 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 155 thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */ 156 thrust::sort(thrust::device,o_j,d_j.end()); 157 auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */ 158 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 159 noff = thrust::distance(o_j,wit); 160 ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr); 161 cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 162 ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 163 ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 164 ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 165 ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr); 166 if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no); 167 ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 168 169 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 170 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 171 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 172 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 173 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 174 ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 175 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 176 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 177 178 /* GPU memory, cusparse specific call handles it internally */ 179 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 180 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 181 ierr = PetscFree(jj);CHKERRQ(ierr); 182 183 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 184 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 185 ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 186 ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 187 /* 188 ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 189 ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 190 */ 191 ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 192 B->preallocated = PETSC_TRUE; 193 B->nonzerostate++; 194 195 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 196 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 197 B->offloadmask = PETSC_OFFLOAD_CPU; 198 B->assembled = PETSC_FALSE; 199 B->was_assembled = PETSC_FALSE; 200 PetscFunctionReturn(0); 201 } 202 203 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 204 { 205 Mat Ad,Ao; 206 const PetscInt *cmap; 207 PetscErrorCode ierr; 208 209 PetscFunctionBegin; 210 ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 211 ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 212 if (glob) { 213 PetscInt cst, i, dn, on, *gidx; 214 215 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 216 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 217 ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 218 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 219 for (i=0; i<dn; i++) gidx[i] = cst + i; 220 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 221 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 227 { 228 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 229 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 230 PetscErrorCode ierr; 231 PetscInt i; 232 233 PetscFunctionBegin; 234 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 235 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 236 if (PetscDefined(USE_DEBUG) && d_nnz) { 237 for (i=0; i<B->rmap->n; i++) { 238 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 239 } 240 } 241 if (PetscDefined(USE_DEBUG) && o_nnz) { 242 for (i=0; i<B->rmap->n; i++) { 243 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 244 } 245 } 246 #if defined(PETSC_USE_CTABLE) 247 ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr); 248 #else 249 ierr = PetscFree(b->colmap);CHKERRQ(ierr); 250 #endif 251 ierr = PetscFree(b->garray);CHKERRQ(ierr); 252 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 253 ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr); 254 /* Because the B will have been resized we simply destroy it and create a new one each time */ 255 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 256 if (!b->A) { 257 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 258 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 259 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 260 } 261 if (!b->B) { 262 PetscMPIInt size; 263 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr); 264 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 265 ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr); 266 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 267 } 268 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 269 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 270 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 271 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 272 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 273 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 274 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 275 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 276 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 277 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 278 /* Let A, B use b's handle with pre-set stream 279 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 280 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 281 */ 282 B->preallocated = PETSC_TRUE; 283 PetscFunctionReturn(0); 284 } 285 286 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 287 { 288 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 289 PetscErrorCode ierr; 290 PetscInt nt; 291 292 PetscFunctionBegin; 293 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 294 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 295 /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */ 296 if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);} 297 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 298 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 299 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 300 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 301 PetscFunctionReturn(0); 302 } 303 304 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 305 { 306 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 311 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 312 PetscFunctionReturn(0); 313 } 314 315 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 316 { 317 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 318 PetscErrorCode ierr; 319 PetscInt nt; 320 321 PetscFunctionBegin; 322 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 323 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 324 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 325 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 326 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 327 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 332 { 333 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 334 PetscErrorCode ierr; 335 PetscInt nt; 336 337 PetscFunctionBegin; 338 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 339 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 340 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 341 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 342 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 343 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 344 PetscFunctionReturn(0); 345 } 346 347 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 348 { 349 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 350 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 351 352 PetscFunctionBegin; 353 switch (op) { 354 case MAT_CUSPARSE_MULT_DIAG: 355 cusparseStruct->diagGPUMatFormat = format; 356 break; 357 case MAT_CUSPARSE_MULT_OFFDIAG: 358 cusparseStruct->offdiagGPUMatFormat = format; 359 break; 360 case MAT_CUSPARSE_ALL: 361 cusparseStruct->diagGPUMatFormat = format; 362 cusparseStruct->offdiagGPUMatFormat = format; 363 break; 364 default: 365 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 366 } 367 PetscFunctionReturn(0); 368 } 369 370 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 371 { 372 MatCUSPARSEStorageFormat format; 373 PetscErrorCode ierr; 374 PetscBool flg; 375 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 376 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 377 378 PetscFunctionBegin; 379 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 380 if (A->factortype==MAT_FACTOR_NONE) { 381 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 382 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 383 if (flg) { 384 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 385 } 386 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 387 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 388 if (flg) { 389 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 390 } 391 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 392 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 393 if (flg) { 394 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 395 } 396 } 397 ierr = PetscOptionsTail();CHKERRQ(ierr); 398 PetscFunctionReturn(0); 399 } 400 401 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 402 { 403 PetscErrorCode ierr; 404 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 405 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 406 PetscObjectState onnz = A->nonzerostate; 407 408 PetscFunctionBegin; 409 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 410 if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); } 411 if (onnz != A->nonzerostate && cusp->deviceMat) { 412 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 413 cudaError_t cerr; 414 415 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 416 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 417 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 418 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 419 if (h_mat->allocated_indices) { 420 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 421 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 422 if (h_mat->offdiag.j) { 423 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 424 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 425 } 426 } 427 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 428 ierr = PetscFree(h_mat);CHKERRQ(ierr); 429 cusp->deviceMat = NULL; 430 } 431 PetscFunctionReturn(0); 432 } 433 434 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 435 { 436 PetscErrorCode ierr; 437 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 438 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 439 cusparseStatus_t stat; 440 441 PetscFunctionBegin; 442 if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 443 if (cusparseStruct->deviceMat) { 444 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 445 cudaError_t cerr; 446 447 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 448 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 449 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 450 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 451 if (h_mat->allocated_indices) { 452 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 453 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 454 if (h_mat->offdiag.j) { 455 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 456 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 457 } 458 } 459 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 460 ierr = PetscFree(h_mat);CHKERRQ(ierr); 461 } 462 try { 463 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 464 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 465 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 466 /* We want cusparseStruct to use PetscDefaultCudaStream 467 if (cusparseStruct->stream) { 468 cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 469 } 470 */ 471 delete cusparseStruct->coo_p; 472 delete cusparseStruct->coo_pw; 473 delete cusparseStruct; 474 } catch(char *ex) { 475 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 476 } 477 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 478 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 479 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 480 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 481 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 482 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 483 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 484 PetscFunctionReturn(0); 485 } 486 487 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 488 { 489 PetscErrorCode ierr; 490 Mat_MPIAIJ *a; 491 Mat_MPIAIJCUSPARSE *cusparseStruct; 492 cusparseStatus_t stat; 493 Mat A; 494 495 PetscFunctionBegin; 496 if (reuse == MAT_INITIAL_MATRIX) { 497 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 498 } else if (reuse == MAT_REUSE_MATRIX) { 499 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 500 } 501 A = *newmat; 502 A->boundtocpu = PETSC_FALSE; 503 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 504 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 505 506 a = (Mat_MPIAIJ*)A->data; 507 if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 508 if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 509 if (a->lvec) { 510 ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 511 } 512 513 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 514 a->spptr = new Mat_MPIAIJCUSPARSE; 515 516 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 517 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 518 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 519 cusparseStruct->coo_p = NULL; 520 cusparseStruct->coo_pw = NULL; 521 cusparseStruct->stream = 0; 522 cusparseStruct->deviceMat = NULL; 523 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 524 } 525 526 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 527 A->ops->mult = MatMult_MPIAIJCUSPARSE; 528 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 529 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 530 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 531 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 532 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 533 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 534 535 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 536 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 537 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 538 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 539 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 540 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 541 #if defined(PETSC_HAVE_HYPRE) 542 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 543 #endif 544 PetscFunctionReturn(0); 545 } 546 547 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 548 { 549 PetscErrorCode ierr; 550 551 PetscFunctionBegin; 552 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 553 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 554 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 555 PetscFunctionReturn(0); 556 } 557 558 /*@ 559 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 560 (the default parallel PETSc format). This matrix will ultimately pushed down 561 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 562 assembly performance the user should preallocate the matrix storage by setting 563 the parameter nz (or the array nnz). By setting these parameters accurately, 564 performance during matrix assembly can be increased by more than a factor of 50. 565 566 Collective 567 568 Input Parameters: 569 + comm - MPI communicator, set to PETSC_COMM_SELF 570 . m - number of rows 571 . n - number of columns 572 . nz - number of nonzeros per row (same for all rows) 573 - nnz - array containing the number of nonzeros in the various rows 574 (possibly different for each row) or NULL 575 576 Output Parameter: 577 . A - the matrix 578 579 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 580 MatXXXXSetPreallocation() paradigm instead of this routine directly. 581 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 582 583 Notes: 584 If nnz is given then nz is ignored 585 586 The AIJ format (also called the Yale sparse matrix format or 587 compressed row storage), is fully compatible with standard Fortran 77 588 storage. That is, the stored row and column indices can begin at 589 either one (as in Fortran) or zero. See the users' manual for details. 590 591 Specify the preallocated storage with either nz or nnz (not both). 592 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 593 allocation. For large problems you MUST preallocate memory or you 594 will get TERRIBLE performance, see the users' manual chapter on matrices. 595 596 By default, this format uses inodes (identical nodes) when possible, to 597 improve numerical efficiency of matrix-vector products and solves. We 598 search for consecutive rows with the same nonzero structure, thereby 599 reusing matrix information to achieve increased efficiency. 600 601 Level: intermediate 602 603 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 604 @*/ 605 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 606 { 607 PetscErrorCode ierr; 608 PetscMPIInt size; 609 610 PetscFunctionBegin; 611 ierr = MatCreate(comm,A);CHKERRQ(ierr); 612 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 613 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 614 if (size > 1) { 615 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 616 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 617 } else { 618 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 619 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 620 } 621 PetscFunctionReturn(0); 622 } 623 624 /*MC 625 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 626 627 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 628 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 629 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 630 631 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 632 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 633 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 634 for communicators controlling multiple processes. It is recommended that you call both of 635 the above preallocation routines for simplicity. 636 637 Options Database Keys: 638 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 639 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 640 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 641 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 642 643 Level: beginner 644 645 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 646 M*/ 647 648 /*MC 649 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 650 651 Level: beginner 652 653 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 654 M*/ 655 656 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 657 658 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 659 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 660 { 661 PetscSplitCSRDataStructure d_mat; 662 PetscMPIInt size; 663 PetscErrorCode ierr; 664 int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 665 PetscScalar *aa = NULL,*ba = NULL; 666 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 667 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 668 CsrMatrix *matrixA = NULL,*matrixB = NULL; 669 670 PetscFunctionBegin; 671 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 672 if (A->factortype != MAT_FACTOR_NONE) { 673 *B = NULL; 674 PetscFunctionReturn(0); 675 } 676 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 677 // get jaca 678 if (size == 1) { 679 PetscBool isseqaij; 680 681 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 682 if (isseqaij) { 683 jaca = (Mat_SeqAIJ*)A->data; 684 if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 685 cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 686 d_mat = cusparsestructA->deviceMat; 687 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 688 } else { 689 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 690 if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 691 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 692 jaca = (Mat_SeqAIJ*)aij->A->data; 693 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 694 d_mat = spptr->deviceMat; 695 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 696 } 697 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 698 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 699 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 700 matrixA = (CsrMatrix*)matstruct->mat; 701 bi = NULL; 702 bj = NULL; 703 ba = NULL; 704 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 705 } else { 706 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 707 if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 708 jaca = (Mat_SeqAIJ*)aij->A->data; 709 jacb = (Mat_SeqAIJ*)aij->B->data; 710 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 711 712 if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 713 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 714 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 715 if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 716 if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 717 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 718 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr); 719 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 720 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 721 if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 722 if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 723 matrixA = (CsrMatrix*)matstructA->mat; 724 matrixB = (CsrMatrix*)matstructB->mat; 725 if (jacb->compressedrow.use) { 726 if (!cusparsestructB->rowoffsets_gpu) { 727 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 728 cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 729 } 730 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 731 } else { 732 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 733 } 734 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 735 ba = thrust::raw_pointer_cast(matrixB->values->data()); 736 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 737 d_mat = spptr->deviceMat; 738 } 739 if (jaca->compressedrow.use) { 740 if (!cusparsestructA->rowoffsets_gpu) { 741 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 742 cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 743 } 744 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 745 } else { 746 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 747 } 748 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 749 aa = thrust::raw_pointer_cast(matrixA->values->data()); 750 751 if (!d_mat) { 752 cudaError_t cerr; 753 PetscSplitCSRDataStructure h_mat; 754 755 // create and populate strucy on host and copy on device 756 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 757 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 758 cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr); 759 if (size > 1) { /* need the colmap array */ 760 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 761 PetscInt *colmap; 762 PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 763 764 if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 765 766 ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr); 767 for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 768 #if defined(PETSC_USE_64BIT_INDICES) 769 { // have to make a long version of these 770 int *h_bi32, *h_bj32; 771 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 772 ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr); 773 cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 774 for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 775 cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 776 for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 777 778 cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr); 779 cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 780 cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr); 781 cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 782 783 h_mat->offdiag.i = d_bi64; 784 h_mat->offdiag.j = d_bj64; 785 h_mat->allocated_indices = PETSC_TRUE; 786 787 ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr); 788 } 789 #else 790 h_mat->offdiag.i = (PetscInt*)bi; 791 h_mat->offdiag.j = (PetscInt*)bj; 792 h_mat->allocated_indices = PETSC_FALSE; 793 #endif 794 h_mat->offdiag.a = ba; 795 h_mat->offdiag.n = A->rmap->n; 796 797 cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr); 798 cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 799 ierr = PetscFree(colmap);CHKERRQ(ierr); 800 } 801 h_mat->rstart = A->rmap->rstart; 802 h_mat->rend = A->rmap->rend; 803 h_mat->cstart = A->cmap->rstart; 804 h_mat->cend = A->cmap->rend; 805 h_mat->N = A->cmap->N; 806 #if defined(PETSC_USE_64BIT_INDICES) 807 { 808 if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 809 int *h_ai32, *h_aj32; 810 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 811 ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr); 812 cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 813 for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 814 cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 815 for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 816 817 cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr); 818 cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 819 cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr); 820 cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 821 822 h_mat->diag.i = d_ai64; 823 h_mat->diag.j = d_aj64; 824 h_mat->allocated_indices = PETSC_TRUE; 825 826 ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr); 827 } 828 #else 829 h_mat->diag.i = (PetscInt*)ai; 830 h_mat->diag.j = (PetscInt*)aj; 831 h_mat->allocated_indices = PETSC_FALSE; 832 #endif 833 h_mat->diag.a = aa; 834 h_mat->diag.n = A->rmap->n; 835 h_mat->rank = PetscGlobalRank; 836 // copy pointers and metadata to device 837 cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 838 ierr = PetscFree(h_mat);CHKERRQ(ierr); 839 } else { 840 ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr); 841 } 842 *B = d_mat; 843 A->offloadmask = PETSC_OFFLOAD_GPU; 844 PetscFunctionReturn(0); 845 } 846