#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: assigns tuple element <0> to tuple element <1>.
   Used below with a zip iterator to gather permuted COO values into a work array. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert numerical values (in COO ordering) into the matrix; the nonzero pattern
   must have been established beforehand by MatSetPreallocationCOO_MPIAIJCUSPARSE().
   v may be a host or a device pointer (detected via isCudaMem); imode is forwarded
   to the sequential kernels.  On return the matrix is marked assembled and
   GPU-resident. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal-block + off-diagonal-block */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* Preallocation partitioned/permuted the COO entries (permutation stored in
       coo_p); apply the same permutation to the user's values into coo_pw. */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so one for_each performs the gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* first coo_nd permuted values belong to the diagonal block, the rest to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation was needed at preallocation time: values are already split diag-first */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* values now live on the GPU: update assembly/offload bookkeeping accordingly */
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when the column (element <1>) lies inside
   [cstart,cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside [cstart,cend),
   i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shifts a global index down by the local ownership start (global -> local). */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Establish the nonzero pattern from n COO (i,j) pairs: partitions the entries
   into diagonal/off-diagonal blocks on the device, builds the compact map of
   off-process columns (b->garray), and creates/preallocates the sequential
   cuSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;      /* number of distinct off-process columns */
  THRUSTINTARRAY         d_i(n);        /* device copies of the COO row/column indices */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* wipe any previous block matrices / scatter state before rebuilding */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr =
MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  /* stage the COO indices on the device */
  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it;
     if both exist the input interleaves the two blocks and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* partition (i,j,identity) triples so diagonal-block entries come first;
       coo_p records where each original entry ended up and coo_pw is the
       matching value work array consumed later by MatSetValuesCOO */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort + unique the off-diagonal columns on the device to obtain the compact garray */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the host copy of the off-diagonal columns to local (garray) numbering
     via a hash-based global-to-local mapping; n is reused as the output count here */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr =
MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  /* both blocks share the parallel matrix's cusparse handle */
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* only the pattern has been set; values are inserted later by MatSetValuesCOO */
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   (A_loc).  If glob is non-NULL, also return the global column indices of the
   merged matrix: owned columns first (contiguous from the column ownership
   start), then the off-process columns taken from the column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;  /* owned columns are contiguous */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];  /* off-process columns from the column map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (nnz-per-row) preallocation for MPIAIJCUSPARSE: validates the nnz
   hints (debug builds), recreates the sequential blocks as MATSEQAIJCUSPARSE,
   and forwards the preallocation to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only validation of the user-provided per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single process there is no off-diagonal block: give it zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    /* parallel case: forward the flag to both sequential blocks */
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr =
MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* y = A*x: begin the halo scatter of x, run the diagonal-block multiply while
   the scatter is in flight, then add the off-diagonal contribution computed
   from the ghost vector a->lvec. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values.  If a device "split CSR" mirror exists, its diagonal
   and off-diagonal value arrays are zeroed directly on the GPU before the host
   blocks are zeroed. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the CSR row offsets */
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
      /* fetch each device-side value-array pointer out of the device struct,
         then memset that array to zero on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr =
MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* z = y + A*x: same scatter/compute overlap pattern as MatMult, using the
   blocks' multadd kernels. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T*x: apply both local transposes first, then reverse-scatter the
   off-diagonal contribution (held in the ghost vector) back to the owning
   processes with ADD_VALUES. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cusparse storage format for the diagonal block, the
   off-diagonal block, or both. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* options that select the SpMV storage formats
   of the diagonal and off-diagonal blocks (only meaningful for unfactored
   matrices). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly via the host MPIAIJ path; on the first final assembly switch
   the ghost vector to the CUDA vector type, and keep the offload flag on the
   GPU when a device mirror exists (values were assembled on the device). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/* Release all CUSPARSE-specific resources (device split-CSR mirror, block
   handles, cusparse handle, COO work arrays), then fall through to the generic
   MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; /* h_mat is a host copy (value, not pointer) */
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so we can free the device arrays it points to */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were device-allocated by GetDeviceMatWrite only when
       compressed rows are in use; otherwise they alias cusparse-owned storage */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try
{
    /* block handles may be shared with the parallel matrix; clear them before tearing down */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the composed methods installed at conversion time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (duplicate, copy, or in place depending on reuse)
   to MPIAIJCUSPARSE: switch the blocks and ghost vector to their CUDA types,
   create the cusparse context on first conversion, and install the GPU-aware
   operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    /* first conversion of this matrix: create the cusparse context */
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* install the GPU-aware operations */
  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Registered constructor for MATMPIAIJCUSPARSE: verify CUDA is initialized,
   build an MPIAIJ matrix, and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* single-process communicator: use the sequential cuSPARSE type directly */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
/* Return (in *B) a device pointer to a stripped-down "split CSR" mirror of A,
   suitable for setting values from device kernels.  Works for both sequential
   (size==1) and MPI matrices; the mirror is created lazily on first call and
   cached in the matrix's spptr.
   NOTE(review): this definition continues past the end of the visible chunk;
   only the visible portion is annotated here. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;   /* location of the cached device mirror pointer */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* raw device CSR row-offset/column-index pointers */
  PetscScalar                *aa,*ba;         /* raw device CSR value pointers */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diagonal" CSR exists */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab the CSR data of both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no device mirror */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* first call: allocate the device struct, then fill a host staging copy
       (h_mat) that is memcpy'd to the device (copy happens past this chunk) */
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      /* sequential: whole matrix is the "diagonal" part, no off-diagonal data */
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; /* sentinel to detect overwrites of the extra slot */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; /* 1-based so 0 means "not present" */
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* compressed-row case: upload a private device copy of the row offsets */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi; /* otherwise alias the cusparse CSR row offsets */
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
h_mat.diag.a = aa; 806 // copy pointers and metdata to device 807 err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 808 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 809 } else { 810 *B = *p_d_mat; 811 } 812 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 813 PetscFunctionReturn(0); 814 #endif 815 } 816