#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Binary functor: copies the first tuple component onto the second.
   Used below to gather permuted COO values into the work array coo_pw. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Set values of a matrix previously preallocated with MatSetPreallocationCOO_MPIAIJCUSPARSE().
   v may point to host or device memory. When a permutation (cusp->coo_p) was built during
   preallocation, values are first gathered through it into the work array cusp->coo_pw, whose
   first coo_nd entries feed the diagonal block and the remaining coo_no the off-diagonal block. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather v through the permutation into coo_pw: coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed: values are already split diag-first/offdiag-last */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on a (row,col,perm) tuple: true when the column lies inside [cstart,cend),
   i.e. the entry belongs to the diagonal block */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it lies outside [cstart,cend) (off-diagonal block) */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local range start to obtain a local index */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Preallocate from n COO entries with global indices coo_i[]/coo_j[]:
   partitions the entries into diagonal-block-first/off-diagonal-last order (recording the
   permutation in cusp->coo_p when one is needed), converts indices from global to local,
   builds garray (sorted unique off-diagonal global columns) and the local off-diagonal
   column indices, then (re)creates the sequential diag/offdiag blocks on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* if an off-diag entry precedes a diag entry the input is not already partitioned;
     in that case partition (stably w.r.t. the recorded permutation) diag-first */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the off-diagonal global columns on the device to build garray */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate off-diagonal global columns in jj to local (w.r.t. garray) via a hash map */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diag and offdiag blocks into one sequential matrix; when glob is requested,
   also return the global column indices of the merged matrix (owned columns first, then
   the off-diagonal columns from cmap) */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation: validates d_nnz/o_nnz in debug mode, destroys stale
   scatter/colmap state, (re)creates the sequential diag/offdiag CUSPARSE blocks and
   preallocates them */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    /* fix: MPI calls are checked with CHKERRMPI (as done elsewhere in this file), not CHKERRQ */
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    /* MPI case: forward the flag to both sequential blocks */
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx: overlap the scatter of ghost values with the diag-block multiply, then add
   the offdiag-block contribution */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all entries; when a stripped-down device matrix exists, also zero its diag/offdiag
   value arrays directly on the GPU */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      /* fix: typo "offfdiag" in the info message */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value pointers, then memset the value arrays on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

PetscFunctionReturn(0);
}

/* zz = A*xx + yy: same scatter/compute overlap pattern as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx: offdiag^T contributions are accumulated into lvec and scattered back in
   reverse with ADD_VALUES */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diag and/or offdiag block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fix: error message listed MAT_CUSPARSE_MULT_DIAG twice and named a nonexistent
       MAT_CUSPARSE_MULT_ALL; list the enum values actually handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the storage formats of the blocks */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly via the MPIAIJ path, then make the ghost vector a CUDA vector and,
   when a device matrix exists, mark the data as resident on the GPU */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

/* Free all CUSPARSE-specific state (device matrix copies, handles, stream, COO work
   arrays), remove the composed methods, and fall through to MatDestroy_MPIAIJ */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct back to the host so its internal device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: switch the blocks and ghost vector to their
   CUDA types, allocate/initialize the CUSPARSE-specific struct when needed, and install
   the CUSPARSE implementations of the Mat operations */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: checks CUDA availability, builds an MPIAIJ matrix, and converts it
   in place to MPIAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it computed from M)
.  n - number of local columns (or PETSC_DECIDE to have it computed from N)
.  M - number of global rows (or PETSC_DETERMINE to have it computed from m)
.  N - number of global columns (or PETSC_DETERMINE to have it computed from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* single-process communicator: the sequential type suffices */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.
666 667 Options Database Keys: 668 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 669 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 670 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 671 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 672 673 Level: beginner 674 675 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 676 M 677 M*/ 678 679 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 680 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 681 { 682 #if defined(PETSC_USE_CTABLE) 683 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 684 #else 685 PetscSplitCSRDataStructure **p_d_mat; 686 PetscMPIInt size,rank; 687 MPI_Comm comm; 688 PetscErrorCode ierr; 689 int *ai,*bi,*aj,*bj; 690 PetscScalar *aa,*ba; 691 692 PetscFunctionBegin; 693 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 694 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 695 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 696 if (A->factortype == MAT_FACTOR_NONE) { 697 CsrMatrix *matrixA,*matrixB=NULL; 698 if (size == 1) { 699 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 700 p_d_mat = &cusparsestruct->deviceMat; 701 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 702 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 703 matrixA = 
(CsrMatrix*)matstruct->mat; 704 bi = bj = NULL; ba = NULL; 705 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 706 } else { 707 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 708 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 709 p_d_mat = &spptr->deviceMat; 710 Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 711 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 712 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 713 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 714 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 715 if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 716 matrixA = (CsrMatrix*)matstructA->mat; 717 matrixB = (CsrMatrix*)matstructB->mat; 718 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 719 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 720 ba = thrust::raw_pointer_cast(matrixB->values->data()); 721 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 722 } 723 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 724 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 725 aa = thrust::raw_pointer_cast(matrixA->values->data()); 726 } else { 727 *B = NULL; 728 PetscFunctionReturn(0); 729 } 730 // act like MatSetValues because not called on host 731 if (A->assembled) { 732 if (A->was_assembled) { 733 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 734 } 735 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 
736 } else { 737 SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 738 } 739 if (!*p_d_mat) { 740 cudaError_t err; 741 PetscSplitCSRDataStructure *d_mat, h_mat; 742 Mat_SeqAIJ *jaca; 743 PetscInt n = A->rmap->n, nnz; 744 // create and copy 745 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 746 err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 747 err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 748 *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 749 if (size == 1) { 750 jaca = (Mat_SeqAIJ*)A->data; 751 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 752 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 753 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 754 h_mat.offdiag.a = NULL; 755 } else { 756 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 757 Mat_SeqAIJ *jacb; 758 jaca = (Mat_SeqAIJ*)aij->A->data; 759 jacb = (Mat_SeqAIJ*)aij->B->data; 760 if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 761 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 762 // create colmap - this is ussually done (lazy) in MatSetValues 763 aij->donotstash = PETSC_TRUE; 764 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 765 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 766 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 767 aij->colmap[A->cmap->N] = -9; 768 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 769 { 770 PetscInt ii; 771 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 772 } 773 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 774 // allocate B copy data 775 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 776 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 
777 nnz = jacb->i[n]; 778 779 if (jacb->compressedrow.use) { 780 err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 781 err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 782 } else h_mat.offdiag.i = bi; 783 h_mat.offdiag.j = bj; 784 h_mat.offdiag.a = ba; 785 786 err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 787 err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 788 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 789 h_mat.offdiag.n = n; 790 } 791 // allocate A copy data 792 nnz = jaca->i[n]; 793 h_mat.diag.n = n; 794 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 795 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 796 if (jaca->compressedrow.use) { 797 err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 798 err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 799 } else { 800 h_mat.diag.i = ai; 801 } 802 h_mat.diag.j = aj; 803 h_mat.diag.a = aa; 804 // copy pointers and metdata to device 805 err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 806 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 807 } else { 808 *B = *p_d_mat; 809 } 810 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 811 PetscFunctionReturn(0); 812 #endif 813 } 814