#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: assign the first tuple element to the second.  Used with a zip
   iterator to scatter permuted COO values into the device work array coo_pw. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state hanging off the Mat_MPIAIJCUSPARSE spptr.
   Safe to call when spptr is NULL or when no COO preallocation was ever done. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    /* Extended COO path: maps/permutations for locally-owned entries (suffix 1),
       entries received from remote ranks (suffix 2), and the send permutation/buffers */
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* Basic COO path state (thrust vectors allocated with new) */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Set COO values in the "basic" path: entries were partitioned diagonal-first at
   preallocation time; if a permutation (coo_p) exists, permute v[] on the device
   into coo_pw before forwarding the diag/offdiag slices to the Seq implementations. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total COO entries = diag + offdiag */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v[] is on the host; stage it into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* Gather v[coo_p[k]] into coo_pw[k] so values line up with the partitioned indices */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    /* No permutation needed: v[] is already ordered diag-first */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when the column index (tuple element 1)
   lies inside the local diagonal-block column range [cstart,cend) */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside [cstart,cend) */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local range start to obtain a local index */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Basic COO preallocation: all (i,j) are locally owned.  Partitions the entries
   diagonal-block-first on the device, builds the offdiag column map (garray), and
   preallocates the Seq diag (b->A) and offdiag (b->B) blocks from the device arrays. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt                N,*jj;
  size_t                  noff = 0;
  THRUSTINTARRAY          d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY          d_j(n);
  ISLocalToGlobalMapping  l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diag and offdiag entries are interleaved: stably partition (i,j,perm) tuples so
       that diagonal-block entries come first, and remember the permutation in coo_p */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* map each offdiag global column id in jj[] to its compacted local id via a hash L2G map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point.  Chooses the basic path when every rank's
   entries are locally owned and nonnegative; otherwise falls back to the generic
   MPIAIJ extended-COO setup and mirrors its host-side maps onto the device. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* Reset assembly-related state so preallocation can be redone from scratch */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* Allocate device copies of the host-side COO maps built by MatSetPreallocationCOO_MPIAIJ */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Pack values destined for remote ranks: buf[i] = kv[perm[i]].
   Grid-stride loop; any 1D launch configuration is valid. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate COO values into the matrix value array: for each nonzero i, sum the
   kv[perm[k]] contributions with k in [jmap[i], jmap[i+1]) into a[imap[i]].
   Each thread owns distinct nonzeros, so no atomics are needed. Grid-stride loop. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i += grid_size) {
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];
  }
}

/* Set COO values on the device.  In the extended path, local contributions are added
   while remote contributions are exchanged through a PetscSF (communication is
   overlapped with the local kernels); otherwise defers to the basic path. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
      if (size > 1) {
        PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diag and offdiag blocks into one local SeqAIJCUSPARSE matrix; optionally
   return the global column indices (diag columns first, then the offdiag map). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (i = 0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns are contiguous */
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];   /* offdiag columns come from the garray map */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation: validate nnz arrays, recreate the diag/offdiag
   Seq blocks as SEQAIJCUSPARSE, and forward the preallocation to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx; the diag multiply overlaps the scatter of the offdiag ghost values */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
  PetscFunctionReturn(0);
}

/* Zero both the diagonal and off-diagonal blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same communication/computation overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}

/* yy = A^T*xx: local transpose multiplies, then reverse-scatter adds the offdiag part */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diag and/or offdiag block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed: the message previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent name MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the GPU storage formats */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Finish assembly; if the nonzero pattern changed, the cached split-CSR device
   matrix is stale and must be freed (it is rebuilt on demand). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* copy the device struct back to the host so its interior device pointers can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destroy the matrix: release the split-CSR device matrix (if any), the COO
   device state, the cusparse struct, and the composed functions, then defer to
   the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    /* same host-side mirror trick as in MatAssemblyEnd to free interior device pointers */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place or into a copy) to MPIAIJCUSPARSE: switch the
   sub-matrices and local vector to CUDA types and install the CUSPARSE ops table. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

/* Type constructor registered for MATMPIAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm  - MPI communicator
.  m     - number of local rows (or PETSC_DECIDE)
.  n     - number of local columns (or PETSC_DECIDE)
.  M     - number of global rows (or PETSC_DETERMINE)
.  N     - number of global columns (or PETSC_DETERMINE)
.  d_nz  - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz  - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  if (size > 1) {
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  } else {
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions().
   Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

   Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

/*
   MatCUSPARSEGetDeviceMatWrite - hand back a device pointer to a stripped-down CSR view
   (PetscSplitCSRDataStructure) of an assembled matrix, for both seq and MPI Mat.

   The struct is built lazily: on first call a host copy is filled with raw device pointers
   into the cached cuSPARSE CSR data (diag part; plus offdiag part and a colmap when size > 1)
   and memcpy'd to a device allocation cached in the Mat's spptr (deviceMat); later calls
   return the cached device struct.  For factored matrices *B is set to NULL.

   The matrix must be in MAT_CUSPARSE_CSR format for every part involved, and is marked
   PETSC_OFFLOAD_GPU on return (the caller is expected to write values on the device).
*/
// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; // raw device CSR pointers (32-bit, as stored by cuSPARSE structs)
  PetscScalar                *aa = NULL,*ba = NULL;                       // raw device value pointers for diag (a*) and offdiag (b*)
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* no device view for factored matrices */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    /* a single-rank matrix may be a true SEQAIJ or an MPIAIJ on a self communicator */
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* sequential case: no off-diagonal part */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* host B uses compressed rows: cache a GPU copy of jacb->i (rmap->n+1 entries) for row offsets */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same compressed-row handling for the diagonal part */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    PetscCall(PetscInfo(A,"Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap[g] = (local offdiag column of global column g) + 1; zero (from calloc) marks "not present locally" */
      PetscCall(PetscCalloc1(N+1,&colmap));
      /* NOTE(review): the (int) cast looks unnecessary and potentially lossy with 64-bit indices since colmap is PetscInt — confirm */
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        /* cuSPARSE keeps 32-bit offsets/indices; round-trip through the host to widen them to PetscInt on the device */
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* widened copies are owned by this struct */

        PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      /* 32-bit PetscInt: the cuSPARSE arrays can be aliased directly */
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    /* ownership ranges and global sizes used for device-side (row,col) -> local index mapping */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* same 32-bit -> 64-bit widening for the diagonal part */
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8,"");
      PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  /* caller is handed writable device pointers, so the GPU copy becomes authoritative */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}