1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 #include <thrust/advance.h> 10 #include <petscsf.h> 11 12 struct VecCUDAEquals 13 { 14 template <typename Tuple> 15 __host__ __device__ 16 void operator()(Tuple t) 17 { 18 thrust::get<1>(t) = thrust::get<0>(t); 19 } 20 }; 21 22 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 23 { 24 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 25 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 26 PetscInt n = cusp->coo_nd + cusp->coo_no; 27 PetscErrorCode ierr; 28 cudaError_t cerr; 29 30 PetscFunctionBegin; 31 if (cusp->coo_p && v) { 32 thrust::device_ptr<const PetscScalar> d_v; 33 THRUSTARRAY *w = NULL; 34 35 if (isCudaMem(v)) { 36 d_v = thrust::device_pointer_cast(v); 37 } else { 38 w = new THRUSTARRAY(n); 39 w->assign(v,v+n); 40 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 41 d_v = w->data(); 42 } 43 44 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 45 cusp->coo_pw->begin())); 46 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 47 cusp->coo_pw->end())); 48 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 49 thrust::for_each(zibit,zieit,VecCUDAEquals()); 50 cerr = WaitForCUDA();CHKERRCUDA(cerr); 51 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 52 delete w; 53 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 54 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 55 } else { 56 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr); 57 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 58 } 59 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 60 A->num_ass++; 61 A->assembled = PETSC_TRUE; 62 A->ass_nonzerostate = A->nonzerostate; 63 A->offloadmask = PETSC_OFFLOAD_GPU; 64 PetscFunctionReturn(0); 65 } 66 67 template <typename Tuple> 68 struct IsNotOffDiagT 69 { 70 PetscInt _cstart,_cend; 71 72 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 73 __host__ __device__ 74 inline bool operator()(Tuple t) 75 { 76 return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 77 } 78 }; 79 80 struct IsOffDiag 81 { 82 PetscInt _cstart,_cend; 83 84 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 85 __host__ __device__ 86 inline bool operator() (const PetscInt &c) 87 { 88 return c < _cstart || c >= _cend; 89 } 90 }; 91 92 struct GlobToLoc 93 { 94 PetscInt _start; 95 96 GlobToLoc(PetscInt start) : _start(start) {} 97 __host__ __device__ 98 inline PetscInt operator() (const PetscInt &c) 99 { 100 return c - _start; 101 } 102 }; 103 104 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 105 { 106 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 107 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 108 PetscErrorCode ierr; 109 PetscInt *jj; 110 size_t noff = 0; 111 THRUSTINTARRAY d_i(n); 112 THRUSTINTARRAY d_j(n); 113 ISLocalToGlobalMapping l2g; 114 cudaError_t cerr; 115 116 PetscFunctionBegin; 117 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 118 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 119 if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 120 if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 121 ierr = PetscFree(b->garray);CHKERRQ(ierr); 122 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 123 ierr = MatDestroy(&b->A);CHKERRQ(ierr); 124 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 125 126 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 127 d_i.assign(coo_i,coo_i+n); 128 d_j.assign(coo_j,coo_j+n); 129 delete cusp->coo_p; 130 delete cusp->coo_pw; 131 cusp->coo_p = NULL; 132 cusp->coo_pw = NULL; 133 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 134 auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 135 auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 136 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 137 cusp->coo_p = new THRUSTINTARRAY(n); 138 cusp->coo_pw = new THRUSTARRAY(n); 139 thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 140 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 141 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 142 auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 143 firstoffd = mzipp.get_iterator_tuple().get<1>(); 144 } 145 cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 146 cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 147 148 /* from global to local */ 149 thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 150 thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 151 cerr = WaitForCUDA();CHKERRCUDA(cerr); 152 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 153 154 /* copy offdiag column indices to map on the CPU */ 155 ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); 156 cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 157 auto o_j = d_j.begin(); 158 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 159 thrust::advance(o_j,cusp->coo_nd); 160 thrust::sort(thrust::device,o_j,d_j.end()); 161 auto wit = thrust::unique(thrust::device,o_j,d_j.end()); 162 cerr = WaitForCUDA();CHKERRCUDA(cerr); 163 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 164 noff = thrust::distance(o_j,wit); 165 ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr); 166 cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 167 ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 168 ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 169 ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 170 ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr); 171 if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no); 172 ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 173 174 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 175 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 176 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 177 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 178 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 179 ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 180 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 181 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 182 183 /* GPU memory, cusparse specific call handles it internally */ 184 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 185 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 186 ierr = PetscFree(jj);CHKERRQ(ierr); 187 188 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 189 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 190 ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 191 ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 192 /* 193 ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 194 ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 195 */ 196 ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 197 B->preallocated = PETSC_TRUE; 198 B->nonzerostate++; 199 200 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 201 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 202 B->offloadmask = PETSC_OFFLOAD_CPU; 203 B->assembled = PETSC_FALSE; 204 B->was_assembled = PETSC_FALSE; 205 PetscFunctionReturn(0); 206 } 207 208 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 209 { 210 Mat Ad,Ao; 211 const PetscInt *cmap; 212 PetscErrorCode ierr; 213 214 PetscFunctionBegin; 215 ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 216 ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 217 if (glob) { 218 PetscInt cst, i, dn, on, *gidx; 219 220 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 221 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 222 ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 223 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 224 for (i=0; i<dn; i++) gidx[i] = cst + i; 225 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 226 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 227 } 228 PetscFunctionReturn(0); 229 } 230 231 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 232 { 233 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 234 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 235 PetscErrorCode ierr; 236 PetscInt i; 237 238 PetscFunctionBegin; 239 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 240 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 241 if (PetscDefined(USE_DEBUG) && d_nnz) { 242 for (i=0; i<B->rmap->n; i++) { 243 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 244 } 245 } 246 if (PetscDefined(USE_DEBUG) && o_nnz) { 247 for (i=0; i<B->rmap->n; i++) { 248 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 249 } 250 } 251 #if defined(PETSC_USE_CTABLE) 252 ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr); 253 #else 254 ierr = PetscFree(b->colmap);CHKERRQ(ierr); 255 #endif 256 ierr = PetscFree(b->garray);CHKERRQ(ierr); 257 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 258 ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr); 259 /* Because the B will have been resized we simply destroy it and create a new one each time */ 260 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 261 if (!b->A) { 262 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 263 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 264 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 265 } 266 if (!b->B) { 267 PetscMPIInt size; 268 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr); 269 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 270 ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr); 271 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 272 } 273 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 274 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 275 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 276 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 277 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 278 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 279 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 280 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 281 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 282 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 283 /* Let A, B use b's handle with pre-set stream 284 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 285 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 286 */ 287 B->preallocated = PETSC_TRUE; 288 PetscFunctionReturn(0); 289 } 290 291 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 292 { 293 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 294 PetscErrorCode ierr; 295 PetscInt nt; 296 297 PetscFunctionBegin; 298 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 299 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 300 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 301 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 302 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 303 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 304 PetscFunctionReturn(0); 305 } 306 307 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 308 { 309 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 310 PetscErrorCode ierr; 311 312 PetscFunctionBegin; 313 if (A->factortype == MAT_FACTOR_NONE) { 314 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 315 PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 316 if (d_mat) { 317 Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 318 Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 319 PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 320 cudaError_t err; 321 PetscScalar *vals; 322 ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 323 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 324 err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 325 err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 326 err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 327 } 328 } 329 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 330 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 331 332 PetscFunctionReturn(0); 333 } 334 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 335 { 336 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 337 PetscErrorCode ierr; 338 PetscInt nt; 339 340 PetscFunctionBegin; 341 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 342 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 343 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 344 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 345 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 346 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 347 PetscFunctionReturn(0); 348 } 349 350 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 351 { 352 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 353 PetscErrorCode ierr; 354 PetscInt nt; 355 356 PetscFunctionBegin; 357 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 358 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 359 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 360 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 361 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 362 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 363 PetscFunctionReturn(0); 364 } 365 366 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 367 { 368 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 369 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 370 371 PetscFunctionBegin; 372 switch (op) { 373 case MAT_CUSPARSE_MULT_DIAG: 374 cusparseStruct->diagGPUMatFormat = format; 375 break; 376 case MAT_CUSPARSE_MULT_OFFDIAG: 377 cusparseStruct->offdiagGPUMatFormat = format; 378 break; 379 case MAT_CUSPARSE_ALL: 380 cusparseStruct->diagGPUMatFormat = format; 381 cusparseStruct->offdiagGPUMatFormat = format; 382 break; 383 default: 384 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 385 } 386 PetscFunctionReturn(0); 387 } 388 389 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 390 { 391 MatCUSPARSEStorageFormat format; 392 PetscErrorCode ierr; 393 PetscBool flg; 394 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 395 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 396 397 PetscFunctionBegin; 398 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 399 if (A->factortype==MAT_FACTOR_NONE) { 400 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 401 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 402 if (flg) { 403 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 404 } 405 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 406 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 407 if (flg) { 408 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 409 } 410 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 411 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 412 if (flg) { 413 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 414 } 415 } 416 ierr = PetscOptionsTail();CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 421 { 422 PetscErrorCode ierr; 423 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 424 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 425 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 426 PetscFunctionBegin; 427 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 428 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 429 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 430 #if defined(PETSC_HAVE_NVSHMEM) 431 { 432 PetscMPIInt result; 433 PetscBool useNvshmem = PETSC_FALSE; 434 ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr); 435 if (useNvshmem) { 436 ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr); 437 if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);} 438 } 439 } 440 #endif 441 } 442 if (d_mat) { 443 A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 444 } 445 PetscFunctionReturn(0); 446 } 447 448 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 449 { 450 PetscErrorCode ierr; 451 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 452 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 453 cudaError_t err; 454 cusparseStatus_t stat; 455 456 PetscFunctionBegin; 457 if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 458 if (cusparseStruct->deviceMat) { 459 Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data; 460 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 461 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; 462 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 463 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 464 if (jaca->compressedrow.use) { 465 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 466 } 467 if (jacb->compressedrow.use) { 468 err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); 469 } 470 err = cudaFree(h_mat.colmap);CHKERRCUDA(err); 471 err = cudaFree(d_mat);CHKERRCUDA(err); 472 } 473 try { 474 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 475 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 476 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 477 /* We want cusparseStruct to use PetscDefaultCudaStream 478 if (cusparseStruct->stream) { 479 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 480 } 481 */ 482 delete cusparseStruct->coo_p; 483 delete cusparseStruct->coo_pw; 484 delete cusparseStruct; 485 } catch(char *ex) { 486 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 487 } 488 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 489 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 490 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 491 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 492 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 493 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 494 PetscFunctionReturn(0); 495 } 496 497 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 498 { 499 PetscErrorCode ierr; 500 Mat_MPIAIJ *a; 501 Mat_MPIAIJCUSPARSE *cusparseStruct; 502 cusparseStatus_t stat; 503 Mat A; 504 505 PetscFunctionBegin; 506 if (reuse == MAT_INITIAL_MATRIX) { 507 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 508 } else if (reuse == MAT_REUSE_MATRIX) { 509 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 510 } 511 A = *newmat; 512 A->boundtocpu = PETSC_FALSE; 513 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 514 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 515 516 a = (Mat_MPIAIJ*)A->data; 517 if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 518 if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 519 if (a->lvec) { 520 ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 521 } 522 523 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 524 a->spptr = new Mat_MPIAIJCUSPARSE; 525 526 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 527 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 528 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 529 cusparseStruct->coo_p = NULL; 530 cusparseStruct->coo_pw = NULL; 531 cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */ 532 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 533 stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 534 cusparseStruct->deviceMat = NULL; 535 } 536 537 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 538 A->ops->mult = MatMult_MPIAIJCUSPARSE; 539 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 540 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 541 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 542 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 543 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 544 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 545 546 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 547 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 548 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 552 PetscFunctionReturn(0); 553 } 554 555 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 556 { 557 PetscErrorCode ierr; 558 559 PetscFunctionBegin; 560 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 561 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 562 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 563 PetscFunctionReturn(0); 564 } 565 566 /*@ 567 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 568 (the default parallel PETSc format). This matrix will ultimately pushed down 569 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 570 assembly performance the user should preallocate the matrix storage by setting 571 the parameter nz (or the array nnz). By setting these parameters accurately, 572 performance during matrix assembly can be increased by more than a factor of 50. 573 574 Collective 575 576 Input Parameters: 577 + comm - MPI communicator, set to PETSC_COMM_SELF 578 . m - number of rows 579 . n - number of columns 580 . nz - number of nonzeros per row (same for all rows) 581 - nnz - array containing the number of nonzeros in the various rows 582 (possibly different for each row) or NULL 583 584 Output Parameter: 585 . A - the matrix 586 587 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 588 MatXXXXSetPreallocation() paradigm instead of this routine directly. 589 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 590 591 Notes: 592 If nnz is given then nz is ignored 593 594 The AIJ format (also called the Yale sparse matrix format or 595 compressed row storage), is fully compatible with standard Fortran 77 596 storage. That is, the stored row and column indices can begin at 597 either one (as in Fortran) or zero. See the users' manual for details. 598 599 Specify the preallocated storage with either nz or nnz (not both). 600 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 601 allocation. For large problems you MUST preallocate memory or you 602 will get TERRIBLE performance, see the users' manual chapter on matrices. 603 604 By default, this format uses inodes (identical nodes) when possible, to 605 improve numerical efficiency of matrix-vector products and solves. We 606 search for consecutive rows with the same nonzero structure, thereby 607 reusing matrix information to achieve increased efficiency. 608 609 Level: intermediate 610 611 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 612 @*/ 613 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 614 { 615 PetscErrorCode ierr; 616 PetscMPIInt size; 617 618 PetscFunctionBegin; 619 ierr = MatCreate(comm,A);CHKERRQ(ierr); 620 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 621 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 622 if (size > 1) { 623 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 624 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 625 } else { 626 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 627 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 628 } 629 PetscFunctionReturn(0); 630 } 631 632 /*MC 633 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 634 635 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 636 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 637 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 638 639 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 640 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 641 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 642 for communicators controlling multiple processes. It is recommended that you call both of 643 the above preallocation routines for simplicity. 644 645 Options Database Keys: 646 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 647 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 648 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 649 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 650 651 Level: beginner 652 653 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 654 M*/ 655 656 /*MC 657 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 658 659 Level: beginner 660 661 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 662 M*/ 663 664 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 665 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 666 { 667 #if defined(PETSC_USE_CTABLE) 668 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 669 #else 670 PetscSplitCSRDataStructure **p_d_mat; 671 PetscMPIInt size,rank; 672 MPI_Comm comm; 673 PetscErrorCode ierr; 674 int *ai,*bi,*aj,*bj; 675 PetscScalar *aa,*ba; 676 677 PetscFunctionBegin; 678 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 679 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 680 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 681 if (A->factortype == MAT_FACTOR_NONE) { 682 CsrMatrix *matrixA,*matrixB=NULL; 683 if (size == 1) { 684 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 685 p_d_mat = &cusparsestruct->deviceMat; 686 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 687 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 688 matrixA = (CsrMatrix*)matstruct->mat; 689 bi = bj = NULL; ba = NULL; 690 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 691 } else { 692 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 693 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 694 p_d_mat = &spptr->deviceMat; 695 Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 696 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 697 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 698 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 699 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 700 if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 701 matrixA = (CsrMatrix*)matstructA->mat; 702 matrixB = (CsrMatrix*)matstructB->mat; 703 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 704 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 705 ba = thrust::raw_pointer_cast(matrixB->values->data()); 706 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 707 } 708 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 709 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 710 aa = thrust::raw_pointer_cast(matrixA->values->data()); 711 } else { 712 *B = NULL; 713 PetscFunctionReturn(0); 714 } 715 // act like MatSetValues because not called on host 716 if (A->assembled) { 717 if (A->was_assembled) { 718 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 719 } 720 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 721 } else { 722 SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 723 } 724 if (!*p_d_mat) { 725 cudaError_t err; 726 PetscSplitCSRDataStructure *d_mat, h_mat; 727 Mat_SeqAIJ *jaca; 728 PetscInt n = A->rmap->n, nnz; 729 // create and copy 730 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 731 err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 732 err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 733 *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 734 if (size == 1) { 735 jaca = (Mat_SeqAIJ*)A->data; 736 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 737 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 738 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 739 h_mat.offdiag.a = NULL; 740 } else { 741 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 742 Mat_SeqAIJ *jacb; 743 jaca = (Mat_SeqAIJ*)aij->A->data; 744 jacb = (Mat_SeqAIJ*)aij->B->data; 745 if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 746 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 747 // create colmap - this is ussually done (lazy) in MatSetValues 748 aij->donotstash = PETSC_TRUE; 749 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 750 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 751 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 752 aij->colmap[A->cmap->N] = -9; 753 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 754 { 755 PetscInt ii; 756 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 757 } 758 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 759 // allocate B copy data 760 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 761 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 762 nnz = jacb->i[n]; 763 764 if (jacb->compressedrow.use) { 765 err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 766 err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 767 } else h_mat.offdiag.i = bi; 768 h_mat.offdiag.j = bj; 769 h_mat.offdiag.a = ba; 770 771 err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 772 err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 773 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 774 h_mat.offdiag.n = n; 775 } 776 // allocate A copy data 777 nnz = jaca->i[n]; 778 h_mat.diag.n = n; 779 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 780 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 781 if (jaca->compressedrow.use) { 782 err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 783 err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 784 } else { 785 h_mat.diag.i = ai; 786 } 787 h_mat.diag.j = aj; 788 h_mat.diag.a = aa; 789 // copy pointers and metdata to device 790 err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 791 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 792 } else { 793 *B = *p_d_mat; 794 } 795 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 796 PetscFunctionReturn(0); 797 #endif 798 } 799