1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 #include <thrust/advance.h> 10 #include <petscsf.h> 11 12 struct VecCUDAEquals 13 { 14 template <typename Tuple> 15 __host__ __device__ 16 void operator()(Tuple t) 17 { 18 thrust::get<1>(t) = thrust::get<0>(t); 19 } 20 }; 21 22 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 23 { 24 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 25 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 26 PetscInt n = cusp->coo_nd + cusp->coo_no; 27 PetscErrorCode ierr; 28 29 PetscFunctionBegin; 30 if (cusp->coo_p && v) { 31 thrust::device_ptr<const PetscScalar> d_v; 32 THRUSTARRAY *w = NULL; 33 34 if (isCudaMem(v)) { 35 d_v = thrust::device_pointer_cast(v); 36 } else { 37 w = new THRUSTARRAY(n); 38 w->assign(v,v+n); 39 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 40 d_v = w->data(); 41 } 42 43 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 44 cusp->coo_pw->begin())); 45 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 46 cusp->coo_pw->end())); 47 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 48 thrust::for_each(zibit,zieit,VecCUDAEquals()); 49 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 50 delete w; 51 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 52 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 53 } else { 54 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr); 55 ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 56 } 57 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 58 A->num_ass++; 59 A->assembled = PETSC_TRUE; 60 A->ass_nonzerostate = A->nonzerostate; 61 A->offloadmask = PETSC_OFFLOAD_GPU; 62 PetscFunctionReturn(0); 63 } 64 65 template <typename Tuple> 66 struct IsNotOffDiagT 67 { 68 PetscInt _cstart,_cend; 69 70 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 71 __host__ __device__ 72 inline bool operator()(Tuple t) 73 { 74 return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 75 } 76 }; 77 78 struct IsOffDiag 79 { 80 PetscInt _cstart,_cend; 81 82 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 83 __host__ __device__ 84 inline bool operator() (const PetscInt &c) 85 { 86 return c < _cstart || c >= _cend; 87 } 88 }; 89 90 struct GlobToLoc 91 { 92 PetscInt _start; 93 94 GlobToLoc(PetscInt start) : _start(start) {} 95 __host__ __device__ 96 inline PetscInt operator() (const PetscInt &c) 97 { 98 return c - _start; 99 } 100 }; 101 102 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 103 { 104 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 105 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 106 PetscErrorCode ierr; 107 PetscInt *jj; 108 size_t noff = 0; 109 THRUSTINTARRAY d_i(n); 110 THRUSTINTARRAY d_j(n); 111 ISLocalToGlobalMapping l2g; 112 cudaError_t cerr; 113 114 PetscFunctionBegin; 115 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 116 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 117 if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 118 if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 119 ierr = PetscFree(b->garray);CHKERRQ(ierr); 120 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 121 ierr = MatDestroy(&b->A);CHKERRQ(ierr); 122 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 123 124 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 125 d_i.assign(coo_i,coo_i+n); 126 d_j.assign(coo_j,coo_j+n); 127 delete cusp->coo_p; 128 delete cusp->coo_pw; 129 cusp->coo_p = NULL; 130 cusp->coo_pw = NULL; 131 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 132 auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 133 auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 134 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 135 cusp->coo_p = new THRUSTINTARRAY(n); 136 cusp->coo_pw = new THRUSTARRAY(n); 137 thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 138 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 139 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 140 auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 141 firstoffd = mzipp.get_iterator_tuple().get<1>(); 142 } 143 cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 144 cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 145 146 /* from global to local */ 147 thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 148 thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 149 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 150 151 /* copy offdiag column indices to map on the CPU */ 152 ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); 153 cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 154 auto o_j = d_j.begin(); 155 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 156 thrust::advance(o_j,cusp->coo_nd); 157 thrust::sort(thrust::device,o_j,d_j.end()); 158 auto wit = thrust::unique(thrust::device,o_j,d_j.end()); 159 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 160 noff = thrust::distance(o_j,wit); 161 ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr); 162 cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 163 ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 164 ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 165 ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 166 ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr); 167 if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no); 168 ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 169 170 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 171 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 172 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 173 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 174 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 175 ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 176 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 177 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 178 179 /* GPU memory, cusparse specific call handles it internally */ 180 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 181 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 182 ierr = PetscFree(jj);CHKERRQ(ierr); 183 184 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 185 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 186 ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 187 ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 188 /* 189 ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 190 ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 191 */ 192 ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 193 B->preallocated = PETSC_TRUE; 194 B->nonzerostate++; 195 196 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 197 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 198 B->offloadmask = PETSC_OFFLOAD_CPU; 199 B->assembled = PETSC_FALSE; 200 B->was_assembled = PETSC_FALSE; 201 PetscFunctionReturn(0); 202 } 203 204 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 205 { 206 Mat Ad,Ao; 207 const PetscInt *cmap; 208 PetscErrorCode ierr; 209 210 PetscFunctionBegin; 211 ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 212 ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 213 if (glob) { 214 PetscInt cst, i, dn, on, *gidx; 215 216 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 217 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 218 ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 219 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 220 for (i=0; i<dn; i++) gidx[i] = cst + i; 221 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 222 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 223 } 224 PetscFunctionReturn(0); 225 } 226 227 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 228 { 229 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 230 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 231 PetscErrorCode ierr; 232 PetscInt i; 233 234 PetscFunctionBegin; 235 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 236 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 237 if (PetscDefined(USE_DEBUG) && d_nnz) { 238 for (i=0; i<B->rmap->n; i++) { 239 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 240 } 241 } 242 if (PetscDefined(USE_DEBUG) && o_nnz) { 243 for (i=0; i<B->rmap->n; i++) { 244 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 245 } 246 } 247 #if defined(PETSC_USE_CTABLE) 248 ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr); 249 #else 250 ierr = PetscFree(b->colmap);CHKERRQ(ierr); 251 #endif 252 ierr = PetscFree(b->garray);CHKERRQ(ierr); 253 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 254 ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr); 255 /* Because the B will have been resized we simply destroy it and create a new one each time */ 256 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 257 if (!b->A) { 258 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 259 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 260 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 261 } 262 if (!b->B) { 263 PetscMPIInt size; 264 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr); 265 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 266 ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr); 267 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 268 } 269 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 270 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 271 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 272 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 273 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 274 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 275 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 276 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 277 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 278 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 279 /* Let A, B use b's handle with pre-set stream 280 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 281 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 282 */ 283 B->preallocated = PETSC_TRUE; 284 PetscFunctionReturn(0); 285 } 286 287 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 288 { 289 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 290 PetscErrorCode ierr; 291 PetscInt nt; 292 293 PetscFunctionBegin; 294 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 295 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 296 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 297 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 298 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 299 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 300 PetscFunctionReturn(0); 301 } 302 303 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 304 { 305 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 306 PetscErrorCode ierr; 307 308 PetscFunctionBegin; 309 if (A->factortype == MAT_FACTOR_NONE) { 310 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 311 PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 312 if (d_mat) { 313 Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 314 Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 315 PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 316 cudaError_t err; 317 PetscScalar *vals; 318 ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 319 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 320 err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 321 err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 322 err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 323 } 324 } 325 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 326 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 327 328 PetscFunctionReturn(0); 329 } 330 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 331 { 332 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 333 PetscErrorCode ierr; 334 PetscInt nt; 335 336 PetscFunctionBegin; 337 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 338 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 339 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 340 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 341 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 342 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 347 { 348 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 349 PetscErrorCode ierr; 350 PetscInt nt; 351 352 PetscFunctionBegin; 353 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 354 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 355 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 356 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 357 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 358 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 359 PetscFunctionReturn(0); 360 } 361 362 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 363 { 364 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 365 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 366 367 PetscFunctionBegin; 368 switch (op) { 369 case MAT_CUSPARSE_MULT_DIAG: 370 cusparseStruct->diagGPUMatFormat = format; 371 break; 372 case MAT_CUSPARSE_MULT_OFFDIAG: 373 cusparseStruct->offdiagGPUMatFormat = format; 374 break; 375 case MAT_CUSPARSE_ALL: 376 cusparseStruct->diagGPUMatFormat = format; 377 cusparseStruct->offdiagGPUMatFormat = format; 378 break; 379 default: 380 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 381 } 382 PetscFunctionReturn(0); 383 } 384 385 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 386 { 387 MatCUSPARSEStorageFormat format; 388 PetscErrorCode ierr; 389 PetscBool flg; 390 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 391 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 392 393 PetscFunctionBegin; 394 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 395 if (A->factortype==MAT_FACTOR_NONE) { 396 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 397 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 398 if (flg) { 399 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 400 } 401 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 402 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 403 if (flg) { 404 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 405 } 406 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 407 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 408 if (flg) { 409 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 410 } 411 } 412 ierr = PetscOptionsTail();CHKERRQ(ierr); 413 PetscFunctionReturn(0); 414 } 415 416 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 417 { 418 PetscErrorCode ierr; 419 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 420 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 421 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 422 PetscFunctionBegin; 423 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 424 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 425 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 426 #if defined(PETSC_HAVE_NVSHMEM) 427 { 428 PetscMPIInt result; 429 PetscBool useNvshmem = PETSC_FALSE; 430 ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr); 431 if (useNvshmem) { 432 ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr); 433 if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);} 434 } 435 } 436 #endif 437 } 438 if (d_mat) { 439 A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 440 } 441 PetscFunctionReturn(0); 442 } 443 444 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 445 { 446 PetscErrorCode ierr; 447 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 448 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 449 cudaError_t err; 450 cusparseStatus_t stat; 451 452 PetscFunctionBegin; 453 if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 454 if (cusparseStruct->deviceMat) { 455 Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data; 456 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 457 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; 458 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 459 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 460 if (jaca->compressedrow.use) { 461 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 462 } 463 if (jacb->compressedrow.use) { 464 err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); 465 } 466 err = cudaFree(h_mat.colmap);CHKERRCUDA(err); 467 err = cudaFree(d_mat);CHKERRCUDA(err); 468 } 469 try { 470 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 471 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 472 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 473 /* We want cusparseStruct to use PetscDefaultCudaStream 474 if (cusparseStruct->stream) { 475 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 476 } 477 */ 478 delete cusparseStruct->coo_p; 479 delete cusparseStruct->coo_pw; 480 delete cusparseStruct; 481 } catch(char *ex) { 482 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 483 } 484 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 485 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 486 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 487 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 488 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 489 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 490 PetscFunctionReturn(0); 491 } 492 493 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 494 { 495 PetscErrorCode ierr; 496 Mat_MPIAIJ *a; 497 Mat_MPIAIJCUSPARSE *cusparseStruct; 498 cusparseStatus_t stat; 499 Mat A; 500 501 PetscFunctionBegin; 502 if (reuse == MAT_INITIAL_MATRIX) { 503 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 504 } else if (reuse == MAT_REUSE_MATRIX) { 505 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 506 } 507 A = *newmat; 508 A->boundtocpu = PETSC_FALSE; 509 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 510 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 511 512 a = (Mat_MPIAIJ*)A->data; 513 if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 514 if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 515 if (a->lvec) { 516 ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 517 } 518 519 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 520 a->spptr = new Mat_MPIAIJCUSPARSE; 521 522 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 523 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 524 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 525 cusparseStruct->coo_p = NULL; 526 cusparseStruct->coo_pw = NULL; 527 cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */ 528 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 529 stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 530 cusparseStruct->deviceMat = NULL; 531 } 532 533 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 534 A->ops->mult = MatMult_MPIAIJCUSPARSE; 535 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 536 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 537 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 538 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 539 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 540 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 541 542 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 543 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 544 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 545 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 546 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 547 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 548 PetscFunctionReturn(0); 549 } 550 551 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 552 { 553 PetscErrorCode ierr; 554 555 PetscFunctionBegin; 556 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 557 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 558 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 559 PetscFunctionReturn(0); 560 } 561 562 /*@ 563 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 564 (the default parallel PETSc format). This matrix will ultimately pushed down 565 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 566 assembly performance the user should preallocate the matrix storage by setting 567 the parameter nz (or the array nnz). By setting these parameters accurately, 568 performance during matrix assembly can be increased by more than a factor of 50. 569 570 Collective 571 572 Input Parameters: 573 + comm - MPI communicator, set to PETSC_COMM_SELF 574 . m - number of rows 575 . n - number of columns 576 . nz - number of nonzeros per row (same for all rows) 577 - nnz - array containing the number of nonzeros in the various rows 578 (possibly different for each row) or NULL 579 580 Output Parameter: 581 . A - the matrix 582 583 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 584 MatXXXXSetPreallocation() paradigm instead of this routine directly. 585 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 586 587 Notes: 588 If nnz is given then nz is ignored 589 590 The AIJ format (also called the Yale sparse matrix format or 591 compressed row storage), is fully compatible with standard Fortran 77 592 storage. That is, the stored row and column indices can begin at 593 either one (as in Fortran) or zero. See the users' manual for details. 594 595 Specify the preallocated storage with either nz or nnz (not both). 596 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 597 allocation. For large problems you MUST preallocate memory or you 598 will get TERRIBLE performance, see the users' manual chapter on matrices. 599 600 By default, this format uses inodes (identical nodes) when possible, to 601 improve numerical efficiency of matrix-vector products and solves. We 602 search for consecutive rows with the same nonzero structure, thereby 603 reusing matrix information to achieve increased efficiency. 604 605 Level: intermediate 606 607 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 608 @*/ 609 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 610 { 611 PetscErrorCode ierr; 612 PetscMPIInt size; 613 614 PetscFunctionBegin; 615 ierr = MatCreate(comm,A);CHKERRQ(ierr); 616 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 617 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 618 if (size > 1) { 619 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 620 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 621 } else { 622 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 623 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 624 } 625 PetscFunctionReturn(0); 626 } 627 628 /*MC 629 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 630 631 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 632 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 633 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 634 635 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 636 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 637 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 638 for communicators controlling multiple processes. It is recommended that you call both of 639 the above preallocation routines for simplicity. 640 641 Options Database Keys: 642 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 643 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 644 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 645 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 646 647 Level: beginner 648 649 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 650 M 651 M*/ 652 653 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 654 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 655 { 656 #if defined(PETSC_USE_CTABLE) 657 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 658 #else 659 PetscSplitCSRDataStructure **p_d_mat; 660 PetscMPIInt size,rank; 661 MPI_Comm comm; 662 PetscErrorCode ierr; 663 int *ai,*bi,*aj,*bj; 664 PetscScalar *aa,*ba; 665 666 PetscFunctionBegin; 667 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 668 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 669 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 670 if (A->factortype == MAT_FACTOR_NONE) { 671 CsrMatrix *matrixA,*matrixB=NULL; 672 if (size == 1) { 673 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 674 p_d_mat = &cusparsestruct->deviceMat; 675 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 676 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 677 matrixA = (CsrMatrix*)matstruct->mat; 678 bi = bj = NULL; ba = NULL; 679 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 680 } else { 681 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 682 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 683 p_d_mat = &spptr->deviceMat; 684 Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 685 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 686 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 687 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 688 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 689 if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 690 matrixA = (CsrMatrix*)matstructA->mat; 691 matrixB = (CsrMatrix*)matstructB->mat; 692 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 693 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 694 ba = thrust::raw_pointer_cast(matrixB->values->data()); 695 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 696 } 697 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 698 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 699 aa = thrust::raw_pointer_cast(matrixA->values->data()); 700 } else { 701 *B = NULL; 702 PetscFunctionReturn(0); 703 } 704 // act like MatSetValues because not called on host 705 if (A->assembled) { 706 if (A->was_assembled) { 707 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 708 } 709 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 710 } else { 711 SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 712 } 713 if (!*p_d_mat) { 714 cudaError_t err; 715 PetscSplitCSRDataStructure *d_mat, h_mat; 716 Mat_SeqAIJ *jaca; 717 PetscInt n = A->rmap->n, nnz; 718 // create and copy 719 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 720 err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 721 err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 722 *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 723 if (size == 1) { 724 jaca = (Mat_SeqAIJ*)A->data; 725 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 726 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 727 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 728 h_mat.offdiag.a = NULL; 729 } else { 730 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 731 Mat_SeqAIJ *jacb; 732 jaca = (Mat_SeqAIJ*)aij->A->data; 733 jacb = (Mat_SeqAIJ*)aij->B->data; 734 if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 735 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 736 // create colmap - this is ussually done (lazy) in MatSetValues 737 aij->donotstash = PETSC_TRUE; 738 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 739 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 740 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 741 aij->colmap[A->cmap->N] = -9; 742 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 743 { 744 PetscInt ii; 745 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 746 } 747 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 748 // allocate B copy data 749 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 750 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 751 nnz = jacb->i[n]; 752 753 if (jacb->compressedrow.use) { 754 err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 755 err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 756 } else h_mat.offdiag.i = bi; 757 h_mat.offdiag.j = bj; 758 h_mat.offdiag.a = ba; 759 760 err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 761 err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 762 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 763 h_mat.offdiag.n = n; 764 } 765 // allocate A copy data 766 nnz = jaca->i[n]; 767 h_mat.diag.n = n; 768 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 769 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 770 if (jaca->compressedrow.use) { 771 err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 772 err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 773 } else { 774 h_mat.diag.i = ai; 775 } 776 h_mat.diag.j = aj; 777 h_mat.diag.a = aa; 778 // copy pointers and metdata to device 779 err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 780 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 781 } else { 782 *B = *p_d_mat; 783 } 784 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 785 PetscFunctionReturn(0); 786 #endif 787 } 788