#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) { /* indices were permuted during preallocation: permute the values the same way */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
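/*
   Usage sketch (illustrative, not compiled here): the two COO routines above are
   dispatched through the "MatSetPreallocationCOO_C" and "MatSetValuesCOO_C"
   function slots composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below, so a user
   reaches them through the generic MatSetPreallocationCOO()/MatSetValuesCOO()
   entry points. The arrays coo_i, coo_j, coo_v (length ncoo, global indices) are
   placeholders for user data. Preallocation is done once for a fixed sparsity
   pattern; values can then be reset repeatedly, e.g. in a nonlinear or
   time-stepping loop:

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);  // once: splits diag/offdiag on the GPU
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);      // repeatable: new values, same pattern
*/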
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
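/*
   Usage sketch (illustrative): MatCUSPARSESetFormat_MPIAIJCUSPARSE() above backs
   the public MatCUSPARSESetFormat() through the "MatCUSPARSESetFormat_C" slot
   composed below. For example, to request the HYB storage format for both the
   diagonal and off-diagonal blocks of a MATMPIAIJCUSPARSE matrix A:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   The same effect is available from the options database via
   -mat_cusparse_storage_format hyb, handled in MatSetFromOptions_MPIAIJCUSPARSE() below.
*/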
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                   nnz_state = A->nonzerostate;

  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy(&nnz_state,&d_mat->nonzerostate,sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
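/*
   Usage sketch (illustrative): MatConvert_MPIAIJ_MPIAIJCUSPARSE() above is what a
   user triggers when converting an existing host matrix to this GPU type, for
   instance to offload an already-assembled MATMPIAIJ matrix B in place:

     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

   MAT_INPLACE_MATRIX reuses B itself; MAT_INITIAL_MATRIX would duplicate into a
   new Mat instead, as handled at the top of the routine.
*/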
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block)
   for PETSc to control dynamic memory allocation. For large problems you MUST
   preallocate memory or you will get TERRIBLE performance, see the users' manual
   chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
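/*
   Usage sketch (illustrative): creating a matrix of this type either directly,

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);

   or through the options database, assuming the application calls MatSetFromOptions():

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetFromOptions(A);CHKERRQ(ierr);

   followed by, e.g., ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
   on the command line. The values 5 and 2 above are placeholder per-row nonzero
   estimates for the diagonal and off-diagonal blocks.
*/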
// get a GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) { // debugging output, disabled by default
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void**)&d_mat,sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset(d_mat,0,sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel to detect out-of-range colmap writes
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i,(n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i,jacb->i,(n+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap,(A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap,aij->colmap,(A->cmap->N+1)*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i,(n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i,jaca->i,(n+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to the device
    err = cudaMemcpy(d_mat,&h_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n,nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
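/*
   Usage sketch (illustrative): how a CUDA kernel might use the handle returned by
   MatCUSPARSEGetDeviceMatWrite() above. This assumes the MatSetValuesDevice()
   device routine from petscaijdevice.h; the grid/block sizes and the inserted
   values are placeholders for application data.

     __global__ void AssembleDiagonal(PetscSplitCSRDataStructure *d_mat, PetscInt rstart, PetscInt nrows)
     {
       PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
       if (i < nrows) {
         PetscInt    row = rstart + i, col = row;  // global indices owned by this rank
         PetscScalar val = 1.0;
         MatSetValuesDevice(d_mat,1,&row,1,&col,&val,ADD_VALUES); // error code should be checked in real code
       }
     }

     // host side
     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     AssembleDiagonal<<<nblocks,nthreads>>>(d_mat,rstart,nrows);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/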