#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
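
/*
   Editorial sketch (not part of the build): how the COO routines in this file are typically
   reached through the public API. MatSetPreallocationCOO() and MatSetValuesCOO() dispatch to
   the _MPIAIJCUSPARSE routines via the "MatSetPreallocationCOO_C"/"MatSetValuesCOO_C"
   functions composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below. The indices and values
   here are hypothetical; v may live in host or device memory, since
   MatSetValuesCOO_MPIAIJCUSPARSE() checks with isCudaMem().

     Mat         A;
     PetscInt    coo_i[] = {0,0,1};        // global row indices of the nonzeros
     PetscInt    coo_j[] = {0,2,1};        // global column indices of the nonzeros
     PetscScalar coo_v[] = {1.0,2.0,3.0};  // values, in the same order

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,3,3);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr);  // builds the diag/offdiag split once
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);   // may be repeated with new values
*/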
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
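
/*
   Editorial note with a worked example (hypothetical numbers): on a rank owning the column
   range [cstart,cend) = [3,6), IsOffDiag classifies global column indices as

       1 -> offdiag, 4 -> diag, 7 -> offdiag, 5 -> diag

   MatSetPreallocationCOO_MPIAIJCUSPARSE() below uses IsNotOffDiagT with thrust::partition to
   move all diagonal-block entries in front of the off-diagonal ones, and GlobToLoc to shift
   the surviving global indices to local numbering. A minimal use of the predicate, assuming
   d_j is a thrust::device_vector<PetscInt> of global column indices:

     auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(3,6));
*/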
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D (COO size)",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific call handles the allocation internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
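
/*
   Editorial sketch (hypothetical sizes): classic d_nnz/o_nnz preallocation works unchanged
   for MATMPIAIJCUSPARSE, since MatMPIAIJSetPreallocation() dispatches to the routine above.

     Mat      A;
     PetscInt d_nnz[] = {3,2}, o_nnz[] = {1,0};  // per-local-row counts for the diag and offdiag blocks

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,2,2,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
*/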
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* overlap the communication of ghost values with the diagonal-block multiply */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
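
/*
   Editorial sketch: the storage format can be chosen either programmatically or from the
   options database (option names as registered above; MAT_CUSPARSE_ELL is a value of the
   MatCUSPARSEStorageFormat enum, assumed here for illustration).

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or, equivalently, on the command line:

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
*/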
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
#if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
#endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p  = NULL;
    cusparseStruct->coo_pw = NULL;
    cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
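
/*
   Editorial sketch: an existing MATMPIAIJ matrix can be moved to this type either with
   MatConvert() or by setting the type up front; both paths end up in
   MatConvert_MPIAIJ_MPIAIJCUSPARSE() above.

     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
*/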
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
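
/*
   Editorial sketch (hypothetical sizes): direct use of MatCreateAIJCUSPARSE(), preallocating
   3 nonzeros per row in the diagonal block and 1 per row in the off-diagonal block.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,3,NULL,1,NULL,&A);CHKERRQ(ierr);
*/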
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssemble, but we are not calling it anymore - done in AIJ AssemblyEnd; needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues; this is done lazily in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
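
/*
   Editorial sketch (hypothetical kernel, not part of this file): how device code might consume
   the pointer returned by MatCUSPARSEGetDeviceMatWrite(). Only fields populated above
   (diag.i, diag.a, diag.n) are touched; the exact struct layout is defined in the PETSc
   headers, and a real kernel would also have to account for compressed-row storage.

     __global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat, PetscScalar alpha)
     {
       int row = blockIdx.x*blockDim.x + threadIdx.x;
       if (row < d_mat->diag.n) {
         for (int k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= alpha;
       }
     }
*/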