#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
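/* A minimal usage sketch of the gather idiom built on VecCUDAEquals (the names w, perm, and out
   are illustrative, not from this file): zipping a permutation_iterator over w with an output
   range and applying VecCUDAEquals performs out[k] = w[perm[k]], i.e. a device-side gather.

     thrust::device_vector<PetscScalar> w = ..., out(n);
     thrust::device_vector<PetscInt>    perm = ...;
     auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),perm.begin()),out.begin()));
     auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),perm.end()),out.end()));
     thrust::for_each(first,last,VecCUDAEquals());
*/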
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
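/* Usage sketch: the two COO routines above are composed onto the matrix in
   MatConvert_MPIAIJ_MPIAIJCUSPARSE() below and are reached through the public
   MatSetPreallocationCOO()/MatSetValuesCOO() interface. Assuming caller-provided host arrays
   coo_i, coo_j (global indices) and coo_v of length ncoo:

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);  (may be repeated with new values)
*/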
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}
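/* Usage sketch: the storage format of the diagonal and off-diagonal blocks can be selected
   programmatically, e.g.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or from the options database with -mat_cusparse_storage_format ell, which is parsed in
   MatSetFromOptions_MPIAIJCUSPARSE() above. */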
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
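/* Usage sketch: an existing MPIAIJ matrix can be moved to this type with

     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);

   converted in place with MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B), or selected
   at options time with -mat_type mpiaijcusparse. */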
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and similarly for o_nz, o_nnz) for PETSc to control
   dynamic memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
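/* Example (a minimal sketch; M and N are assumed global sizes):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);

   preallocates at most 5 nonzeros per row in the diagonal block and 2 per row in the
   off-diagonal block. */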
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) { /* debugging only, never taken */
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err  = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
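/* Usage sketch (the kernel below is hypothetical, not part of PETSc): after

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);

   d_mat can be dereferenced in device code to read or write the split CSR data in place:

     // __global__ void my_kernel(PetscSplitCSRDataStructure *d_mat) {
     //   int nloc = d_mat->diag.n;                       // number of local rows
     //   int nnz0 = d_mat->diag.i[1] - d_mat->diag.i[0]; // nonzeros in the first local row
     //   d_mat->diag.a[0] += 1.0;                        // device-side write
     // }

   Because this routine marks the matrix unassembled, MatAssemblyBegin()/MatAssemblyEnd()
   must be called after the device-side writes complete. */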