#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

/* Thrust functor: copies element 0 of the tuple into element 1; used below to
   scatter user-provided COO values through the coo_p permutation into coo_pw */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/*
   MatSetValuesCOO_MPIAIJCUSPARSE - inserts/adds values for a matrix whose COO pattern
   was previously set with MatSetPreallocationCOO_MPIAIJCUSPARSE().

   Input Parameters:
+  A     - the MPIAIJCUSPARSE matrix
.  v     - array of cusp->coo_nd + cusp->coo_no values in the original user COO ordering
           (may be NULL, in which case NULL is forwarded to the sequential blocks)
-  imode - the insert mode, forwarded to the sequential MatSetValuesCOO

   Notes:
   If preallocation had to reorder the user's COO entries (cusp->coo_p is non-NULL),
   v is first uploaded to the device and permuted through coo_p into the work array
   coo_pw, whose first coo_nd entries feed the diagonal block a->A and the remaining
   coo_no entries feed the off-diagonal block a->B. Otherwise v is passed through
   directly with the same diag/offdiag split.
*/
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    THRUSTARRAY *w;
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    /* upload user values, then gather them through the permutation coo_p into coo_pw */
    w = new THRUSTARRAY(n);
    w->assign(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    delete w;
    /* first coo_nd permuted values belong to the diagonal block, the rest to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* entries were already in diag-then-offdiag order (or v is NULL): pass through */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* mark the matrix assembled without going through MatAssemblyBegin/End */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

/* Thrust predicate on (i,j,perm) tuples: true when the column index (tuple element 1)
   lies inside [_cstart,_cend), i.e. the entry belongs to the diagonal block */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Thrust predicate on a column index: true when it falls outside the local
   column ownership range [_cstart,_cend), i.e. the entry is off-diagonal */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Thrust transform: shifts a global index down by _start to obtain a local index */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/*
   MatSetPreallocationCOO_MPIAIJCUSPARSE - sets the nonzero pattern of the matrix from
   n COO (global row, global column) pairs, splitting them into the local diagonal and
   off-diagonal MATSEQAIJCUSPARSE blocks on the GPU.

   Input Parameters:
+  B     - the MPIAIJCUSPARSE matrix
.  n     - number of COO entries owned by this rank
.  coo_i - global row indices (host array of length n)
-  coo_j - global column indices (host array of length n)

   Notes:
   Any previous sub-matrices, local vector and garray are destroyed first. The
   (i,j) pairs are uploaded and partitioned on the device so that the coo_nd
   diagonal-block entries precede the coo_no off-diagonal ones; when a reordering
   is actually needed the permutation is remembered in cusp->coo_p (with work array
   cusp->coo_pw) for later use by MatSetValuesCOO. Off-diagonal column indices are
   compacted through a hash ISLocalToGlobalMapping to build b->garray and local
   column indices for b->B.
*/
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;     /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n);      /* device copy of coo_i */
  THRUSTINTARRAY         d_j(n);      /* device copy of coo_j */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous pattern: handles must be cleared before the sub-matrices go away */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input is not already diag-then-offdiag ordered and we must partition */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p starts as the identity 0..n-1 and is dragged along by the partition,
       recording where each original entry ended up */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  /* sort+unique the off-diagonal columns on the device to obtain the ghost column list */
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the (unsorted) off-diagonal global columns in jj to local indices into garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the diagonal (b->A) and off-diagonal (b->B, noff ghost columns) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* pattern is set but no values yet: matrix is not assembled */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
   MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE - merges the diagonal and off-diagonal blocks
   into one sequential matrix A_loc; optionally returns in glob the global column index
   of each column of the merged matrix (owned columns first, then the ghost columns cmap).
*/
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    /* locally owned columns map to [cst,cst+dn); ghost columns use the cmap (garray) */
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - MPIAIJCUSPARSE implementation of
   MatMPIAIJSetPreallocation(): creates (if needed) the two MATSEQAIJCUSPARSE blocks,
   preallocates them with the diag/offdiag nnz information, and propagates the
   cusparse storage format, handle and stream to both.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only validation of the user-provided nnz arrays */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* off-diagonal block starts with the full global column space; it is compacted later */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
   MatMult_MPIAIJCUSPARSE - yy = A*xx. Overlaps the ghost-value scatter with the
   diagonal-block multiply, then adds the off-diagonal contribution.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);           /* yy  = Ad*xx, overlapped with the scatter */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); /* yy += Ao*lvec */
  PetscFunctionReturn(0);
}

/*
   MatZeroEntries_MPIAIJCUSPARSE - zeros all matrix entries. If a device-side split-CSR
   mirror exists (see MatCUSPARSEGetDeviceMatWrite), its diag/offdiag value arrays are
   zeroed directly with cudaMemset before zeroing the host-side sub-matrices.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
      /* fetch the device value-array pointers from the device-resident struct, then memset them */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/*
   MatMultAdd_MPIAIJCUSPARSE - zz = yy + A*xx, overlapping the ghost scatter with the
   diagonal-block multiply-add as in MatMult_MPIAIJCUSPARSE.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);      /* zz  = yy + Ad*xx */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* zz += Ao*lvec */
  PetscFunctionReturn(0);
}

/*
   MatMultTranspose_MPIAIJCUSPARSE - yy = A^T*xx. The off-diagonal transpose product is
   computed into the local work vector first, then scattered back (ADD_VALUES, reverse)
   onto the result of the diagonal transpose product.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); /* lvec = Ao^T*xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);      /* yy   = Ad^T*xx */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested cusparse storage format
   for the diagonal block, the off-diagonal block, or both. The format is applied to
   the sub-matrices at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*
   MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* storage-format
   options (diag only, offdiag only, or both); skipped for factored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatAssemblyEnd_MPIAIJCUSPARSE - standard MPIAIJ assembly end, plus: on the first
   final assembly the local ghost vector is switched to VECSEQCUDA, and if a device
   matrix mirror exists the offload mask is set to GPU (assembly happened on device).
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/*
   MatDestroy_MPIAIJCUSPARSE - frees all GPU resources (device matrix mirror, cusparse
   handle, stream, COO permutation arrays), detaches the composed functions, and then
   delegates to MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the struct back to the host to recover the device pointers we cudaMalloc'd
       in MatCUSPARSEGetDeviceMatWrite; row offsets were only copied when compressedrow
       was in use, so only free them in that case */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts an MPIAIJ matrix to MPIAIJCUSPARSE:
   switches the default vector type to CUDA, retypes the sub-matrices and local vector,
   allocates the Mat_MPIAIJCUSPARSE context (with a cusparse handle) when needed, and
   installs the CUSPARSE-specific operations and composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this is B itself */

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;  /* default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor registered for MATMPIAIJCUSPARSE: builds an MPIAIJ matrix, then
   converts it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm  - MPI communicator
.  m     - number of local rows (or PETSC_DECIDE)
.  n     - number of local columns (or PETSC_DECIDE)
.  M     - number of global rows (or PETSC_DETERMINE)
.  N     - number of global columns (or PETSC_DETERMINE)
.  d_nz  - number of nonzeros per row of the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz  - number of nonzeros per row of the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored; see MatCreateAIJ() and
   MatMPIAIJSetPreallocation() for the exact meaning of the diagonal/off-diagonal
   preallocation parameters. On a single-process communicator only d_nz/d_nnz are used.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* uniprocessor communicators get the sequential type; o_nz/o_nnz are unused */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   MatCUSPARSEGetDeviceMatWrite - returns in B a device pointer to a stripped-down
   split-CSR view of A (diag and offdiag blocks), suitable for writing values from
   device kernels. Works for both sequential and MPI AIJCUSPARSE matrices; creates
   and caches the device structure on first use. On return A->assembled is set to
   PETSC_FALSE because the caller is expected to write values (MatSetValues-like use).
   Not available with PETSC_USE_CTABLE (the colmap must be a plain array so it can be
   mirrored on the GPU).
*/
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat; /* slot in the (seq or mpi) spptr caching the device struct */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* raw device CSR row-offset/column-index pointers */
  PetscScalar                *aa,*ba;         /* raw device CSR value pointers */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential case: only the "diagonal" matrix exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* MPI case: grab raw CSR pointers from both the diag (A) and offdiag (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices are not supported: return NULL quietly */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: build the host-side mirror h_mat, then copy it to a freshly
       cudaMalloc'd device struct d_mat that is cached in the Mat */
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel to detect overwrite of the guard slot below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      /* with compressed rows, jacb->i is host data and must be copied to the device;
         otherwise bi already points at the device row offsets from the CsrMatrix */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; /* already built: return the cached device struct */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}