#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: given a tuple (src,dst), assign dst = src.
   Used with a zip of (permuted value iterator, destination iterator) to gather
   COO values into partitioned order on the device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all GPU-resident COO-assembly state hanging off the matrix's spptr:
   the extended-COO device arrays (maps/permutations/comm buffers) and the
   basic-COO permutation arrays coo_p/coo_pw.  Safe to call when spptr is NULL
   or when the extended arrays were never allocated (guarded by use_extended_coo). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* Device arrays allocated by MatSetPreallocationCOO_MPIAIJCUSPARSE (extended path) */
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* Basic-COO state (thrust device vectors); delete on NULL is a no-op */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total COO entries = diag + offdiag */

  PetscFunctionBegin;
  /* If a permutation (coo_p) was built during preallocation, values must be
     gathered into partitioned (diag-first) order before being handed to the
     sequential diag/offdiag blocks; otherwise v[] is already in that order. */
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* already on device, use in place */
    } else {
      /* host values: stage them into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* zip (permuted source, destination coo_pw) and copy element-wise */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w; /* delete on NULL is a no-op */
    /* first coo_nd entries go to the diagonal block, the rest to the off-diagonal block */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (row,col,perm) tuples: true when the column lies inside
   [_cstart,_cend), i.e. the entry belongs to the diagonal block.
   Used by thrust::partition to move diagonal entries to the front. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside this rank's
   column ownership range [_cstart,_cend) (off-diagonal entry). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index to a local one by subtracting the ownership start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Basic COO preallocation: every (coo_i,coo_j) entry is locally owned.
   Partitions the entries into diagonal/off-diagonal blocks on the device,
   builds the off-diagonal column map (garray), and creates the two
   SEQAIJCUSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  /* the diag/offdiag blocks are rebuilt from scratch below */
  if (b->A) PetscCall(MatCUSPARSEClearHandle(b->A));
  if (b->B) PetscCall(MatCUSPARSEClearHandle(b->B));
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  /* locate the first off-diagonal entry and the first diagonal entry after it;
     if both exist the input is not already partitioned and we must permute */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag !=
d_j.end()) {
    /* entries are interleaved: build a stable permutation coo_p that moves all
       diagonal entries first, and a work array coo_pw for permuted values */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>(); /* partition point in d_j */
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit); /* # distinct off-process columns */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* compact jj[] in place: map each global offdiag column id to its index in garray */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* (re)create the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff)); /* B is compacted to noff columns */
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
  PetscCall(MatCUSPARSESetHandle(b->A,cusp->handle));
  PetscCall(MatCUSPARSESetHandle(b->B,cusp->handle));
  /*
  PetscCall(MatCUSPARSESetStream(b->A,cusp->stream));
  PetscCall(MatCUSPARSESetStream(b->B,cusp->stream));
  */

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* General COO preallocation entry point: chooses the basic (all-local) path or
   the extended path (which supports negative/remote indices via MatSetPreallocationCOO_MPIAIJ
   plus device mirrors of its maps). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const
PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* wipe any state from a previous preallocation/assembly */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    /* the scan is only possible for host indices; device indices take the basic path
       unless another rank forces otherwise below */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    /* extended path: let the CPU implementation build the maps, then mirror them on the device */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host COO maps built by MatSetPreallocationCOO_MPIAIJ;
       suffix 1 = locally-produced entries, suffix 2 = entries received from remote ranks */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    /* communication buffers for remote entries */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0,nnz). Grid-stride loop, so
   any launch configuration covers all entries. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

/* Scatter-add kernel: for each nonzero i, accumulate the (possibly repeated)
   COO contributions kv[perm[jmap[i]..jmap[i+1])] into a[imap[i]].
   Each nonzero is handled by exactly one thread, so no atomics are needed. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

/* Insert/add COO values v[] (host or device memory) into the matrix, using the
   extended device maps when available, otherwise the basic path. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar))); /* INSERT: start from zero, then accumulate */
      if (size > 1) {
        PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the device staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU; /* values now live on the GPU */
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into one sequential matrix A_loc;
   optionally return in glob the global column indices of A_loc's columns
   (owned columns first, then the off-process columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (i = 0;
i<dn; i++) gidx[i] = cst + i;                   /* owned columns: shift to global */
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i]; /* off-process columns from the map */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}

/* Standard MPIAIJ preallocation adapted for the CUSPARSE backend: (re)creates
   the SEQAIJCUSPARSE diag/offdiag blocks, forwards the nnz hints, and attaches
   the shared cusparse handle and storage formats. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* debug-only validation of the user-supplied nnz arrays */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    /* uniprocessor: the off-diagonal block is empty */
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  PetscCall(MatCUSPARSESetHandle(b->A,cusparseStruct->handle));
  PetscCall(MatCUSPARSESetHandle(b->B,cusparseStruct->handle));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx: overlap the scatter of off-process entries with the diagonal
   multiply, then add the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
  PetscFunctionReturn(0);
}

/* Zero both the diagonal and off-diagonal blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same comm/compute overlap as MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}

PetscErrorCode
MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  /* yy = A^T*xx: transpose-multiply both blocks, then reverse-scatter (with
     addition) the off-diagonal contribution back to its owning ranks */
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both; the format is applied when the blocks are
   (re)built. Errors on any other MatCUSPARSEFormatOperation value. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the cases handled above */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Options handler: -mat_cusparse_mult_diag_storage_format,
   -mat_cusparse_mult_offdiag_storage_format and -mat_cusparse_storage_format
   select the cuSPARSE storage format for the respective blocks (factored
   matrices are left untouched). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options"));
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}

/* Assembly-end hook: runs the MPIAIJ assembly, moves lvec to CUDA, and drops
   any cached device-side split-CSR matrix when the nonzero pattern changed. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* state before assembly, to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    /* the cached device matrix describes a stale nonzero pattern; free it */
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* copy the device struct back to host to read its embedded device pointers */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destroy hook: frees the device split-CSR matrix (if any), the cusparse
   handle and COO state, removes the composed methods, and chains to the
   MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    /* same teardown as in MatAssemblyEnd: fetch struct to host, free its device arrays */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  try {
    if (aij->A) PetscCall(MatCUSPARSEClearHandle(aij->A));
    if (aij->B) PetscCall(MatCUSPARSEClearHandle(aij->B));
    PetscCallCUSPARSE(cusparseDestroy(cusparseStruct->handle));
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      PetscCallCUDA(cudaStreamDestroy(cusparseStruct->stream));
    }
    */
    /* Free COO */
    PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place or into a new matrix, per reuse) to
   MPIAIJCUSPARSE: switches the blocks/vectors to their CUDA types, installs
   the CUSPARSE ops table, and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); /* vectors created from A default to CUDA */

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
PetscCallCUSPARSE(cusparseCreate(&(cusp->handle))); 671 a->spptr = cusp; 672 } 673 674 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 675 A->ops->mult = MatMult_MPIAIJCUSPARSE; 676 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 677 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 678 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 679 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 680 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 681 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 682 683 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE)); 684 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 685 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 686 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 687 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE)); 688 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE)); 689 #if defined(PETSC_HAVE_HYPRE) 690 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 691 #endif 692 PetscFunctionReturn(0); 693 } 694 695 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 696 { 697 PetscFunctionBegin; 698 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 699 PetscCall(MatCreate_MPIAIJ(A)); 700 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A)); 701 PetscFunctionReturn(0); 702 } 703 704 /*@ 705 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 706 (the default parallel PETSc format). This matrix will ultimately pushed down 707 to NVidia GPUs and use the CUSPARSE library for calculations. 
   For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it determined from M)
.  n - number of local columns (or PETSC_DECIDE to have it determined from N)
.  M - number of global rows (or PETSC_DETERMINE to have it determined from m)
.  N - number of global columns (or PETSC_DETERMINE to have it determined from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal arguments)
   for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* on a single rank the sequential CUSPARSE type is used and the o_* arguments are not needed */
  if (size > 1) {
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  } else {
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions().
Other options include ell (ellpack) or hyb (hybrid). 785 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 786 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 787 788 Level: beginner 789 790 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 791 M*/ 792 793 /*MC 794 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 795 796 Level: beginner 797 798 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 799 M*/ 800 801 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 802 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 803 { 804 PetscSplitCSRDataStructure d_mat; 805 PetscMPIInt size; 806 int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 807 PetscScalar *aa = NULL,*ba = NULL; 808 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 809 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 810 CsrMatrix *matrixA = NULL,*matrixB = NULL; 811 812 PetscFunctionBegin; 813 PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 814 if (A->factortype != MAT_FACTOR_NONE) { 815 *B = NULL; 816 PetscFunctionReturn(0); 817 } 818 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 819 // get jaca 820 if (size == 1) { 821 PetscBool isseqaij; 822 823 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij)); 824 if (isseqaij) { 825 jaca = (Mat_SeqAIJ*)A->data; 826 PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 827 cusparsestructA = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 828 d_mat = cusparsestructA->deviceMat; 829 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 830 } else { 831 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 832 PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 833 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 834 jaca = (Mat_SeqAIJ*)aij->A->data; 835 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 836 d_mat = spptr->deviceMat; 837 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 838 } 839 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 840 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 841 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 842 matrixA = (CsrMatrix*)matstruct->mat; 843 bi = NULL; 844 bj = NULL; 845 ba = NULL; 846 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 847 } else { 848 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 849 PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 850 jaca = (Mat_SeqAIJ*)aij->A->data; 851 jacb = (Mat_SeqAIJ*)aij->B->data; 852 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 853 854 PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. 
Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 855 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 856 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 857 PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 858 if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 859 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 860 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 861 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 862 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 863 PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 864 PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 865 matrixA = (CsrMatrix*)matstructA->mat; 866 matrixB = (CsrMatrix*)matstructB->mat; 867 if (jacb->compressedrow.use) { 868 if (!cusparsestructB->rowoffsets_gpu) { 869 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 870 cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 871 } 872 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 873 } else { 874 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 875 } 876 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 877 ba = thrust::raw_pointer_cast(matrixB->values->data()); 878 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 879 d_mat = spptr->deviceMat; 880 } 881 if (jaca->compressedrow.use) { 882 if (!cusparsestructA->rowoffsets_gpu) { 883 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 884 cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 885 } 886 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 887 } 
else { 888 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 889 } 890 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 891 aa = thrust::raw_pointer_cast(matrixA->values->data()); 892 893 if (!d_mat) { 894 PetscSplitCSRDataStructure h_mat; 895 896 // create and populate strucy on host and copy on device 897 PetscCall(PetscInfo(A,"Create device matrix\n")); 898 PetscCall(PetscNew(&h_mat)); 899 PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat))); 900 if (size > 1) { /* need the colmap array */ 901 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 902 PetscInt *colmap; 903 PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 904 905 PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 906 907 PetscCall(PetscCalloc1(N+1,&colmap)); 908 for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 909 #if defined(PETSC_USE_64BIT_INDICES) 910 { // have to make a long version of these 911 int *h_bi32, *h_bj32; 912 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 913 PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64)); 914 PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost)); 915 for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 916 PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost)); 917 for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 918 919 PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64))); 920 PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice)); 921 PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64))); 922 PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice)); 923 924 h_mat->offdiag.i = d_bi64; 925 h_mat->offdiag.j = d_bj64; 926 h_mat->allocated_indices = PETSC_TRUE; 927 928 PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64)); 929 } 930 #else 931 
h_mat->offdiag.i = (PetscInt*)bi; 932 h_mat->offdiag.j = (PetscInt*)bj; 933 h_mat->allocated_indices = PETSC_FALSE; 934 #endif 935 h_mat->offdiag.a = ba; 936 h_mat->offdiag.n = A->rmap->n; 937 938 PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap))); 939 PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice)); 940 PetscCall(PetscFree(colmap)); 941 } 942 h_mat->rstart = A->rmap->rstart; 943 h_mat->rend = A->rmap->rend; 944 h_mat->cstart = A->cmap->rstart; 945 h_mat->cend = A->cmap->rend; 946 h_mat->M = A->cmap->N; 947 #if defined(PETSC_USE_64BIT_INDICES) 948 { 949 PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 950 int *h_ai32, *h_aj32; 951 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 952 PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64)); 953 PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost)); 954 for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 955 PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost)); 956 for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 957 958 PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64))); 959 PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice)); 960 PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64))); 961 PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice)); 962 963 h_mat->diag.i = d_ai64; 964 h_mat->diag.j = d_aj64; 965 h_mat->allocated_indices = PETSC_TRUE; 966 967 PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64)); 968 } 969 #else 970 h_mat->diag.i = (PetscInt*)ai; 971 h_mat->diag.j = (PetscInt*)aj; 972 h_mat->allocated_indices = PETSC_FALSE; 973 #endif 974 h_mat->diag.a = aa; 975 h_mat->diag.n = A->rmap->n; 976 h_mat->rank = PetscGlobalRank; 977 // copy 
pointers and metadata to device 978 PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice)); 979 PetscCall(PetscFree(h_mat)); 980 } else { 981 PetscCall(PetscInfo(A,"Reusing device matrix\n")); 982 } 983 *B = d_mat; 984 A->offloadmask = PETSC_OFFLOAD_GPU; 985 PetscFunctionReturn(0); 986 } 987