#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather the user-provided values through the permutation coo_p into coo_pw,
       so that diagonal-block entries precede off-diagonal-block entries */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
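/* Helper functors for MatSetPreallocationCOO_MPIAIJCUSPARSE() below:
   IsOffDiag/IsNotOffDiagT test whether a global column index falls outside/inside
   this process's diagonal block [cstart,cend), and GlobToLoc shifts a global
   index to a process-local one. */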
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diagonal- and off-diagonal-block entries are interleaved: compute a permutation
       (coo_p) that moves all diagonal-block entries ahead of the off-diagonal ones;
       coo_pw holds the correspondingly permuted values at MatSetValuesCOO time */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
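/* A minimal usage sketch (not part of the library, kept under #if 0 so it does not
   compile into it) of the COO assembly path implemented above, exercised through the
   public MatSetPreallocationCOO()/MatSetValuesCOO() interface; the two-rows-per-rank
   layout and the values are illustrative assumptions. */
#if 0
static PetscErrorCode ExampleCOOAssembly(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank,size;
  PetscInt       rstart,N,coo_i[2],coo_j[2];
  PetscScalar    v[2] = {1.0,2.0};

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  rstart = 2*rank; N = 2*size;                /* two locally owned rows per rank */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,2,2,N,N);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  coo_i[0] = rstart; coo_j[0] = rstart;       /* lands in the diagonal block b->A     */
  coo_i[1] = rstart; coo_j[1] = (rstart+2)%N; /* lands in the off-diagonal block b->B */
  ierr = MatSetPreallocationCOO(*A,2,coo_i,coo_j);CHKERRQ(ierr); /* dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE() */
  ierr = MatSetValuesCOO(*A,v,INSERT_VALUES);CHKERRQ(ierr);      /* dispatches to MatSetValuesCOO_MPIAIJCUSPARSE()        */
  PetscFunctionReturn(0);
}
#endif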
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* overlap the halo exchange with the local (diagonal-block) product */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz-style or the nnz-style arguments (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
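/* A minimal usage sketch (not part of the library, kept under #if 0; the sizes and
   nonzeros-per-row counts are illustrative assumptions): creating and preallocating
   a matrix with MatCreateAIJCUSPARSE(), with at most 3 nonzeros per row in the
   diagonal block and 2 per row in the off-diagonal block. */
#if 0
static PetscErrorCode ExampleCreateAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateAIJCUSPARSE(comm,10,10,PETSC_DETERMINE,PETSC_DETERMINE,3,NULL,2,NULL,A);CHKERRQ(ierr);
  /* insert entries with MatSetValues(), then assemble as usual */
  ierr = MatAssemblyBegin(*A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif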
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
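/* For example, the type and storage formats described above can be selected at
   runtime (a sketch; the executable name is illustrative):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell

   or programmatically, with MatSetType(A,MATMPIAIJCUSPARSE) followed by
   MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL). */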
// get GPU pointer to stripped down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssembly, but we are not calling it anymore - done in AIJ AssemblyEnd; is it needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need to assemble matrix first");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void**)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in the Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err  = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done lazily in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
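/* A minimal usage sketch (comments only; the kernel name, launch configuration, and
   the device-side setter are assumptions - PETSc builds of this era provide
   MatSetValuesDevice() in petscaijdevice.h for writing into this structure from a
   CUDA kernel):

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr); // A must already be assembled
     assemble_kernel<<<nblocks,nthreads>>>(d_mat,...);            // each thread inserts via MatSetValuesDevice(d_mat,...)
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); // finalize after the device writes
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/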