#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Thrust functor: assign the first tuple element to the second.
   Used below to scatter user COO values through a permutation iterator
   into the (diag,offdiag)-partitioned work array coo_pw. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Set the numerical values of a matrix preallocated in COO format by
   MatSetPreallocationCOO_MPIAIJCUSPARSE().

   v is ordered as the user's original (i,j) input.  If preallocation built a
   permutation (cusp->coo_p, i.e. diag/offdiag entries were interleaved), the
   values are first permuted on the GPU into cusp->coo_pw so that the first
   coo_nd entries belong to the diagonal block and the remaining coo_no to the
   off-diagonal block; otherwise v can be passed through unchanged.
   v may live on host or device memory (isCudaMem() distinguishes). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them into a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source value, destination slot) and copy element-wise */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* coo_pw now holds [diag values (coo_nd) | offdiag values (coo_no)] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* input was already split diag-first at preallocation time (or v is NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* values now reside on the GPU only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on a (row,col,perm) tuple: true when the column index (slot 1)
   lies inside [cstart,cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate: column index falls outside [cstart,cend) (off-diagonal entry). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Transform: shift a global index by -start to obtain a local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Preallocate the MPI matrix from n COO (coo_i[],coo_j[]) global index pairs.

   On the GPU the entries are partitioned into diagonal-block entries
   (column inside the local column ownership range) followed by off-diagonal
   entries; if the two kinds are interleaved a permutation cusp->coo_p and a
   value work array cusp->coo_pw are created for later use by
   MatSetValuesCOO_MPIAIJCUSPARSE().  The off-diagonal column indices are
   sorted/uniqued to build b->garray and compacted to local indices via a hash
   local-to-global mapping, then the two sequential AIJCUSPARSE blocks are
   (re)created and preallocated. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;      /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n);        /* device copy of coo_i */
  THRUSTINTARRAY         d_j(n);        /* device copy of coo_j */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any previous diag/offdiag blocks and supporting data */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* detect whether diag and offdiag entries are interleaved: if some offdiag
     entry is followed by a diag entry we need an explicit partition+permutation */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p records where each original entry went after the partition */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  /* jj gets the unsorted global offdiag columns BEFORE the in-place sort below */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique the offdiag tail of d_j to enumerate the distinct ghost columns */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* map the (global) offdiag columns in jj to local indices into garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate format/handle/stream settings to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix *A_loc; optionally return in *glob the global column indices
   (owned columns first, then ghost columns from cmap) of the merged matrix. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost columns */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (d_nz/d_nnz, o_nz/o_nnz) preallocation for the MPIAIJCUSPARSE type;
   body continues below. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only validation: per-row nonzero counts must be nonnegative */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  /* drop stale column map / ghost data from a previous assembly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block is empty (0 columns) in the uniprocessor case */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the parallel matrix's cusparse settings to both blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    /* MPI case: forward the flag to both sequential blocks */
    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr =
MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx: start the halo scatter of xx into a->lvec, overlap it with the
   diagonal-block multiply, then add the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);                 /* overlaps with the scatter */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);      /* yy += B*lvec */
  PetscFunctionReturn(0);
}

/* Zero all matrix entries.  If a stripped-down device-side matrix
   (spptr->deviceMat) exists, its diag/offdiag value arrays are zeroed
   directly on the GPU before zeroing the host-side blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      /* fixed typo in info message: "offfdiag" -> "offdiag" */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value pointers, then memset the values to zero */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* zz = A*xx + yy, with the same scatter/compute overlap as MatMult above. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx: local transpose multiplies first, then a reverse scatter
   accumulates the off-diagonal contributions into yy. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cusparse storage format for the diagonal and/or
   off-diagonal block; applied when the blocks are (re)built. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
cusparseStruct->diagGPUMatFormat = format; 412 cusparseStruct->offdiagGPUMatFormat = format; 413 break; 414 default: 415 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 416 } 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 421 { 422 MatCUSPARSEStorageFormat format; 423 PetscErrorCode ierr; 424 PetscBool flg; 425 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 426 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 427 428 PetscFunctionBegin; 429 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 430 if (A->factortype==MAT_FACTOR_NONE) { 431 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 432 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 433 if (flg) { 434 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 435 } 436 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 437 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 438 if (flg) { 439 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 440 } 441 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 442 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 443 if (flg) { 444 ierr = 
MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 445 } 446 } 447 ierr = PetscOptionsTail();CHKERRQ(ierr); 448 PetscFunctionReturn(0); 449 } 450 451 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 452 { 453 PetscErrorCode ierr; 454 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 455 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 456 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 457 PetscFunctionBegin; 458 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 459 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 460 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 461 } 462 if (d_mat) { 463 A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 464 } 465 466 PetscFunctionReturn(0); 467 } 468 469 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 470 { 471 PetscErrorCode ierr; 472 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 473 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 474 cudaError_t err; 475 cusparseStatus_t stat; 476 477 PetscFunctionBegin; 478 if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 479 if (cusparseStruct->deviceMat) { 480 Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data; 481 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 482 PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; 483 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 484 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 485 if (jaca->compressedrow.use) { 486 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 487 } 488 if (jacb->compressedrow.use) { 489 err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); 490 } 491 err = cudaFree(h_mat.colmap);CHKERRCUDA(err); 492 err = cudaFree(d_mat);CHKERRCUDA(err); 493 } 494 try { 495 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 496 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 497 stat = 
cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    /* COO permutation/work arrays are thrust containers allocated with new */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the type-specific method implementations composed in MatConvert */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the sequential blocks
   and ghost vector to their CUDA variants, allocate/initialize the
   Mat_MPIAIJCUSPARSE context (cusparse handle, formats, COO scratch), and
   install the GPU implementations of the Mat operations. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX: *newmat is B itself */
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* default stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain MPIAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* pick the MPI or Seq variant based on the communicator size */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.
662 663 Options Database Keys: 664 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 665 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 666 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 667 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 668 669 Level: beginner 670 671 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 672 M 673 M*/ 674 675 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 676 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 677 { 678 #if defined(PETSC_USE_CTABLE) 679 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 680 #else 681 PetscSplitCSRDataStructure **p_d_mat; 682 PetscMPIInt size,rank; 683 MPI_Comm comm; 684 PetscErrorCode ierr; 685 int *ai,*bi,*aj,*bj; 686 PetscScalar *aa,*ba; 687 688 PetscFunctionBegin; 689 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 690 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 691 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 692 if (A->factortype == MAT_FACTOR_NONE) { 693 CsrMatrix *matrixA,*matrixB=NULL; 694 if (size == 1) { 695 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 696 p_d_mat = &cusparsestruct->deviceMat; 697 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 698 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 699 matrixA = 
(CsrMatrix*)matstruct->mat; 700 bi = bj = NULL; ba = NULL; 701 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 702 } else { 703 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 704 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 705 p_d_mat = &spptr->deviceMat; 706 Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 707 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 708 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 709 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 710 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 711 if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 712 matrixA = (CsrMatrix*)matstructA->mat; 713 matrixB = (CsrMatrix*)matstructB->mat; 714 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 715 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 716 ba = thrust::raw_pointer_cast(matrixB->values->data()); 717 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 718 } 719 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 720 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 721 aa = thrust::raw_pointer_cast(matrixA->values->data()); 722 } else { 723 *B = NULL; 724 PetscFunctionReturn(0); 725 } 726 // act like MatSetValues because not called on host 727 if (A->assembled) { 728 if (A->was_assembled) { 729 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 730 } 731 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 
732 } else { 733 SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 734 } 735 if (!*p_d_mat) { 736 cudaError_t err; 737 PetscSplitCSRDataStructure *d_mat, h_mat; 738 Mat_SeqAIJ *jaca; 739 PetscInt n = A->rmap->n, nnz; 740 // create and copy 741 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 742 err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 743 err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 744 *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 745 if (size == 1) { 746 jaca = (Mat_SeqAIJ*)A->data; 747 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 748 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 749 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 750 h_mat.offdiag.a = NULL; 751 } else { 752 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 753 Mat_SeqAIJ *jacb; 754 jaca = (Mat_SeqAIJ*)aij->A->data; 755 jacb = (Mat_SeqAIJ*)aij->B->data; 756 if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 757 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 758 // create colmap - this is ussually done (lazy) in MatSetValues 759 aij->donotstash = PETSC_TRUE; 760 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 761 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 762 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 763 aij->colmap[A->cmap->N] = -9; 764 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 765 { 766 PetscInt ii; 767 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 768 } 769 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 770 // allocate B copy data 771 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 772 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 
773 nnz = jacb->i[n]; 774 775 if (jacb->compressedrow.use) { 776 err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 777 err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 778 } else h_mat.offdiag.i = bi; 779 h_mat.offdiag.j = bj; 780 h_mat.offdiag.a = ba; 781 782 err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 783 err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 784 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 785 h_mat.offdiag.n = n; 786 } 787 // allocate A copy data 788 nnz = jaca->i[n]; 789 h_mat.diag.n = n; 790 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 791 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 792 if (jaca->compressedrow.use) { 793 err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 794 err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 795 } else { 796 h_mat.diag.i = ai; 797 } 798 h_mat.diag.j = aj; 799 h_mat.diag.a = aa; 800 // copy pointers and metdata to device 801 err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 802 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 803 } else { 804 *B = *p_d_mat; 805 } 806 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 807 PetscFunctionReturn(0); 808 #endif 809 } 810