#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator()(const PetscInt &c)
  {
    return c - _start;
  }
};
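/*
   The three functors above drive the COO preallocation below: IsOffDiag flags a
   global column index that falls outside the locally owned column range,
   IsNotOffDiagT applies the complementary test to (i,j,perm) tuples so that
   thrust::partition can move all diagonal-block entries in front of the
   off-diagonal ones, and GlobToLoc shifts global indices to local ones.

   A minimal standalone sketch of the same partition step (hypothetical names,
   not part of this file; IsDiag is simply the negation of IsOffDiag):

     struct IsDiag {
       PetscInt s,e;
       IsDiag(PetscInt s_, PetscInt e_) : s(s_), e(e_) {}
       __host__ __device__ bool operator()(const PetscInt &c) const { return c >= s && c < e; }
     };

     thrust::device_vector<PetscInt> j(4);
     j[0] = 0; j[1] = 5; j[2] = 1; j[3] = 7;  // global column indices, owned range [0,4)
     auto mid = thrust::partition(thrust::device, j.begin(), j.end(), IsDiag(0,4));
     // j is now a permutation with {0,1} first; mid - j.begin() == 2 diagonal entries
*/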
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D (COO size)",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
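/*
   Typical use of the COO assembly path implemented above, through the generic
   MatSetPreallocationCOO()/MatSetValuesCOO() interface that dispatches to the
   _MPIAIJCUSPARSE implementations composed below (a sketch; mat, ncoo, coo_i,
   coo_j, and vals are placeholders supplied by the caller):

     ierr = MatSetType(mat,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(mat,ncoo,coo_i,coo_j);CHKERRQ(ierr); // pattern analysis, done once
     ierr = MatSetValuesCOO(mat,vals,INSERT_VALUES);CHKERRQ(ierr);      // can be repeated as values change
*/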
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy(&vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
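/*
   The storage formats handled by MatSetFromOptions_MPIAIJCUSPARSE above are
   selected on the command line, e.g. (a sketch; ./app stands for any PETSc
   program):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format csr \
           -mat_cusparse_mult_offdiag_storage_format hyb

   or programmatically, e.g. MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL).
*/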
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
#if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
#endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
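/*
   The conversion routine below is used both by explicit MatConvert() calls and,
   with MAT_INPLACE_MATRIX, by the MatCreate_MPIAIJCUSPARSE() constructor. A
   typical explicit use (a sketch; A is an assembled MATMPIAIJ matrix):

     Mat Agpu;
     ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&Agpu);CHKERRQ(ierr);
*/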
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
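/*
   The constructor above is what -mat_type mpiaijcusparse dispatches to; the
   equivalent direct creation sequence is (a sketch; comm and the size and
   preallocation arguments are placeholders):

     Mat A;
     ierr = MatCreate(comm,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
*/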
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
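/*
   Sketch of calling the convenience constructor above (m and n are placeholder
   local sizes; 5 diagonal and 2 off-diagonal nonzeros per row are illustrative):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/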
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssembly but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
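/*
   Sketch of consuming the device handle returned by MatCUSPARSEGetDeviceMatWrite()
   (AssembleKernel is a hypothetical user kernel; entries are typically inserted
   on the device, e.g. with MatSetValuesDevice() from petscaijdevice.h):

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     AssembleKernel<<<nblocks,nthreads>>>(d_mat); // kernel writes matrix entries
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/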