#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: assign the first tuple element to the second (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly data attached to the matrix.
   No-op when the cusparse structure does not exist yet. The extended-COO device
   arrays are only freed when they were actually allocated (use_extended_coo);
   the basic-path permutation arrays (coo_p/coo_pw) are always released. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Insert/add COO values along the "basic" path (every entry locally owned; see
   MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic for how coo_p/coo_nd/coo_no were built). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total entries: diagonal-block + off-diagonal-block */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p holds the permutation that puts diagonal-block entries first; apply it on the device */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage it into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals()); /* coo_pw[k] = v[coo_p[k]] */
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    /* entries were already ordered diagonal-first at preallocation; pass subranges straight through */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when column j lies in [cstart,cend), i.e. in the diagonal block */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside the local column range [cstart,cend) */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the owning range start, producing a local index */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Basic COO preallocation: all (coo_i,coo_j) entries are locally owned.
   Partitions the entries on the device into diagonal-block followed by
   off-diagonal-block groups, builds the off-diagonal global column map (garray),
   and preallocates the two SEQAIJCUSPARSE submatrices. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diagonal and off-diagonal entries are interleaved: record a permutation (coo_p)
       while partitioning, so MatSetValuesCOO can reorder user values the same way */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* number of diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* number of off-diagonal-block entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  /* compact jj[] in place: global off-diagonal column ids -> local ids in [0,noff) */
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point. Uses the basic path when every entry on every
   rank is locally owned with nonnegative indices; otherwise falls back to the
   general MPIAIJ COO machinery and mirrors its host-side maps into device memory
   for the extended MatSetValuesCOO path. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  /* discard any previous assembly/scatter state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    /* staging buffers for values exchanged with other ranks during MatSetValuesCOO */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    /* mirror the host-side COO maps built by MatSetPreallocationCOO_MPIAIJ onto the device */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice)); 284 } 285 PetscFunctionReturn(0); 286 } 287 288 __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[]) 289 { 290 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 291 const PetscCount grid_size = gridDim.x * blockDim.x; 292 for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]]; 293 } 294 295 __global__ void MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode, 296 PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[], 297 PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[]) 298 { 299 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 300 const PetscCount grid_size = gridDim.x * blockDim.x; 301 for (; i<Annz+Bnnz; i+= grid_size) { 302 PetscScalar sum = 0.0; 303 if (i<Annz) { 304 for (PetscCount k=Ajmap1[i]; k<Ajmap1[i+1]; k++) sum += kv[Aperm1[k]]; 305 Aa[i] = (imode == INSERT_VALUES? 0.0 : Aa[i]) + sum; 306 } else { 307 i -= Annz; 308 for (PetscCount k=Bjmap1[i]; k<Bjmap1[i+1]; k++) sum += kv[Bperm1[k]]; 309 Ba[i] = (imode == INSERT_VALUES? 
0.0 : Ba[i]) + sum; 310 } 311 } 312 } 313 314 __global__ void MatAddRemoteCOOValues(const PetscScalar kv[], 315 PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[], 316 PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[]) 317 { 318 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 319 const PetscCount grid_size = gridDim.x * blockDim.x; 320 for (; i<Annz2+Bnnz2; i+= grid_size) { 321 if (i<Annz2) { 322 for (PetscCount k=Ajmap2[i]; k<Ajmap2[i+1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]]; 323 } else { 324 i -= Annz2; 325 for (PetscCount k=Bjmap2[i]; k<Bjmap2[i+1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]]; 326 } 327 } 328 } 329 330 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode) 331 { 332 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 333 Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 334 Mat A = mpiaij->A,B = mpiaij->B; 335 PetscCount Annz = mpiaij->Annz,Annz2 = mpiaij->Annz2,Bnnz = mpiaij->Bnnz,Bnnz2 = mpiaij->Bnnz2; 336 PetscScalar *Aa,*Ba = NULL; 337 PetscScalar *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d; 338 const PetscScalar *v1 = v; 339 const PetscCount *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap2 = mpidev->Aimap2_d; 340 const PetscCount *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap2 = mpidev->Bimap2_d; 341 const PetscCount *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d; 342 const PetscCount *Cperm1 = mpidev->Cperm1_d; 343 PetscMemType memtype; 344 345 PetscFunctionBegin; 346 if (mpidev->use_extended_coo) { 347 PetscMPIInt size; 348 349 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); 350 PetscCall(PetscGetMemType(v,&memtype)); 351 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 352 
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz+Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz+Bnnz+255)/256,256>>>(v1,imode,Annz,Ajmap1,Aperm1,Aa,Bnnz,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2+Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2+Bnnz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    /* release the device copy staged above for host-resident v[] */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into a single SEQAIJCUSPARSE matrix.
   When glob is requested, also return the global column ids of the merged matrix:
   the owned (diagonal) columns first, followed by the off-diagonal map cmap[]. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (i = 0; i<dn; i++) gidx[i]    = cst + i;  /* contiguous owned columns */
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];  /* off-diagonal columns via cmap */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (d_nnz/o_nnz) preallocation, specialized to create SEQAIJCUSPARSE blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have
been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    /* on a single rank there is no off-diagonal block, so give it zero columns */
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* y = A x; the halo scatter of x is overlapped with the diagonal-block multiply */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
  PetscFunctionReturn(0);
}

/* Zero both the diagonal and off-diagonal blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* z = y + A x; same communication/computation overlap pattern as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}

/* y = A^T x; the off-diagonal contribution is scattered back additively in reverse */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Select the cusparse storage format for the diagonal and/or off-diagonal block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message corrected to list the operations actually handled above; the previous text
       repeated MAT_CUSPARSE_MULT_DIAG and named a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options controlling the storage formats used for SpMV */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* AssemblyEnd: delegates to MPIAIJ, then invalidates the cached device matrix
   descriptor when the nonzero pattern changed */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    /* copy the device-side descriptor to the host so its internal device pointers can be freed */
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destroy: free the cached device matrix descriptor (if any), the COO data, the
   cusparse structure, the composed methods, and finally the base MPIAIJ data */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    /* copy the device-side descriptor to the host so its internal device pointers can be freed */
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place or into a new matrix) to MPIAIJCUSPARSE:
   retypes the blocks and local vector to their CUDA variants, installs the
   cusparse structure, and wires up the type-specific operations and composed methods */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain MPIAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of rows
.  n - number of columns
. 
M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  if (size > 1) {
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  } else {
    /* single-rank communicator: use the sequential type instead */
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). 
Other options include ell (ellpack) or hyb (hybrid). 770 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 771 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 772 773 Level: beginner 774 775 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 776 M*/ 777 778 /*MC 779 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 780 781 Level: beginner 782 783 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 784 M*/ 785 786 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 787 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 788 { 789 PetscSplitCSRDataStructure d_mat; 790 PetscMPIInt size; 791 int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 792 PetscScalar *aa = NULL,*ba = NULL; 793 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 794 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 795 CsrMatrix *matrixA = NULL,*matrixB = NULL; 796 797 PetscFunctionBegin; 798 PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 799 if (A->factortype != MAT_FACTOR_NONE) { 800 *B = NULL; 801 PetscFunctionReturn(0); 802 } 803 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 804 // get jaca 805 if (size == 1) { 806 PetscBool isseqaij; 807 808 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij)); 809 if (isseqaij) { 810 jaca = (Mat_SeqAIJ*)A->data; 811 PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 812 cusparsestructA = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 813 d_mat = cusparsestructA->deviceMat; 814 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 815 } else { 816 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 817 PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 818 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 819 jaca = (Mat_SeqAIJ*)aij->A->data; 820 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 821 d_mat = spptr->deviceMat; 822 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 823 } 824 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 825 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 826 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 827 matrixA = (CsrMatrix*)matstruct->mat; 828 bi = NULL; 829 bj = NULL; 830 ba = NULL; 831 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 832 } else { 833 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 834 PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 835 jaca = (Mat_SeqAIJ*)aij->A->data; 836 jacb = (Mat_SeqAIJ*)aij->B->data; 837 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 838 839 PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. 
Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 840 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 841 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 842 PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 843 if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 844 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 845 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 846 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 847 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 848 PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 849 PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 850 matrixA = (CsrMatrix*)matstructA->mat; 851 matrixB = (CsrMatrix*)matstructB->mat; 852 if (jacb->compressedrow.use) { 853 if (!cusparsestructB->rowoffsets_gpu) { 854 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 855 cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 856 } 857 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 858 } else { 859 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 860 } 861 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 862 ba = thrust::raw_pointer_cast(matrixB->values->data()); 863 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 864 d_mat = spptr->deviceMat; 865 } 866 if (jaca->compressedrow.use) { 867 if (!cusparsestructA->rowoffsets_gpu) { 868 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 869 cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 870 } 871 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 872 } else { 
873 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 874 } 875 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 876 aa = thrust::raw_pointer_cast(matrixA->values->data()); 877 878 if (!d_mat) { 879 PetscSplitCSRDataStructure h_mat; 880 881 // create and populate strucy on host and copy on device 882 PetscCall(PetscInfo(A,"Create device matrix\n")); 883 PetscCall(PetscNew(&h_mat)); 884 PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat))); 885 if (size > 1) { /* need the colmap array */ 886 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 887 PetscInt *colmap; 888 PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 889 890 PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 891 892 PetscCall(PetscCalloc1(N+1,&colmap)); 893 for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 894 #if defined(PETSC_USE_64BIT_INDICES) 895 { // have to make a long version of these 896 int *h_bi32, *h_bj32; 897 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 898 PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64)); 899 PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost)); 900 for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 901 PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost)); 902 for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 903 904 PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64))); 905 PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice)); 906 PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64))); 907 PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice)); 908 909 h_mat->offdiag.i = d_bi64; 910 h_mat->offdiag.j = d_bj64; 911 h_mat->allocated_indices = PETSC_TRUE; 912 913 PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64)); 914 } 915 #else 916 
h_mat->offdiag.i = (PetscInt*)bi; 917 h_mat->offdiag.j = (PetscInt*)bj; 918 h_mat->allocated_indices = PETSC_FALSE; 919 #endif 920 h_mat->offdiag.a = ba; 921 h_mat->offdiag.n = A->rmap->n; 922 923 PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap))); 924 PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice)); 925 PetscCall(PetscFree(colmap)); 926 } 927 h_mat->rstart = A->rmap->rstart; 928 h_mat->rend = A->rmap->rend; 929 h_mat->cstart = A->cmap->rstart; 930 h_mat->cend = A->cmap->rend; 931 h_mat->M = A->cmap->N; 932 #if defined(PETSC_USE_64BIT_INDICES) 933 { 934 int *h_ai32, *h_aj32; 935 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 936 937 static_assert(sizeof(PetscInt) == 8,""); 938 PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64)); 939 PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost)); 940 for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 941 PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost)); 942 for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 943 944 PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64))); 945 PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice)); 946 PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64))); 947 PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice)); 948 949 h_mat->diag.i = d_ai64; 950 h_mat->diag.j = d_aj64; 951 h_mat->allocated_indices = PETSC_TRUE; 952 953 PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64)); 954 } 955 #else 956 h_mat->diag.i = (PetscInt*)ai; 957 h_mat->diag.j = (PetscInt*)aj; 958 h_mat->allocated_indices = PETSC_FALSE; 959 #endif 960 h_mat->diag.a = aa; 961 h_mat->diag.n = A->rmap->n; 962 h_mat->rank = PetscGlobalRank; 963 // copy pointers and metadata to device 964 
PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice)); 965 PetscCall(PetscFree(h_mat)); 966 } else { 967 PetscCall(PetscInfo(A,"Reusing device matrix\n")); 968 } 969 *B = d_mat; 970 A->offloadmask = PETSC_OFFLOAD_GPU; 971 PetscFunctionReturn(0); 972 } 973