#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   element into the second. Used below to gather permuted COO values into a
   contiguous work array on the device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state: the extended-COO maps/permutations/
   communication buffers and the basic-COO permutation (coo_p) and work array
   (coo_pw). Safe to call when nothing was ever allocated (spptr may be NULL). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    CHKERRCUDA(cudaFree(cusparseStruct->Aimap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bimap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aimap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Aperm2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bimap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Bperm2_d));
    CHKERRCUDA(cudaFree(cusparseStruct->Cperm1_d));
    CHKERRCUDA(cudaFree(cusparseStruct->sendbuf_d));
    CHKERRCUDA(cudaFree(cusparseStruct->recvbuf_d));
    /* Null the freed pointers so a repeated reset (or any accidental reuse)
       cannot double-free or dereference stale device addresses */
    cusparseStruct->Aimap1_d = cusparseStruct->Ajmap1_d = cusparseStruct->Aperm1_d = NULL;
    cusparseStruct->Bimap1_d = cusparseStruct->Bjmap1_d = cusparseStruct->Bperm1_d = NULL;
    cusparseStruct->Aimap2_d = cusparseStruct->Ajmap2_d = cusparseStruct->Aperm2_d = NULL;
    cusparseStruct->Bimap2_d = cusparseStruct->Bjmap2_d = cusparseStruct->Bperm2_d = NULL;
    cusparseStruct->Cperm1_d  = NULL;
    cusparseStruct->sendbuf_d = NULL;
    cusparseStruct->recvbuf_d = NULL;
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Basic COO value insertion: all entries are locally owned. If a permutation
   (coo_p) was built during preallocation, gather v[] through it into coo_pw
   and hand the diagonal/off-diagonal slices to the sequential routines;
   otherwise v[] is already in the expected (diag-first) order. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[k] = v[coo_p[k]]: undo the diag/offdiag partition permutation */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    CHKERRQ(PetscLogGpuTimeEnd());
    delete w;
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when the column index (tuple slot 1)
   lies inside the local diagonal-block column range [cstart,cend) */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a bare column index: true when it falls outside [cstart,cend) */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local range start, yielding a local index */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Basic COO preallocation: every (coo_i,coo_j) entry is owned by this rank.
   Partitions the entries into diagonal-block and off-diagonal-block groups on
   the device, builds the off-diagonal column map (garray), and preallocates
   the sequential diag (b->A) and offdiag (b->B) CUSPARSE matrices. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  if (b->A) CHKERRQ(MatCUSPARSEClearHandle(b->A));
  if (b->B) CHKERRQ(MatCUSPARSEClearHandle(b->B));
  CHKERRQ(MatDestroy(&b->A));
  CHKERRQ(MatDestroy(&b->B));

  CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  CHKERRQ(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* diag and offdiag entries are interleaved: stable-partition them (diag
       first) and remember the permutation in coo_p so MatSetValuesCOO can
       reorder user values accordingly; coo_pw is the matching work array */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  CHKERRQ(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  CHKERRQ(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  CHKERRCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  CHKERRQ(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  CHKERRQ(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  CHKERRQ(PetscMalloc1(noff,&b->garray));
  CHKERRCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  CHKERRQ(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  /* compact the offdiag column ids in jj[] (global -> position in garray) */
  CHKERRQ(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  CHKERRQ(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  CHKERRQ(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  CHKERRQ(ISLocalToGlobalMappingDestroy(&l2g));

  CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A));
  CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B));
  CHKERRQ(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  CHKERRQ(PetscFree(jj));

  CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
  CHKERRQ(MatCUSPARSESetHandle(b->A,cusp->handle));
  CHKERRQ(MatCUSPARSESetHandle(b->B,cusp->handle));
  /*
  CHKERRQ(MatCUSPARSESetStream(b->A,cusp->stream));
  CHKERRQ(MatCUSPARSESetStream(b->B,cusp->stream));
  */

  CHKERRQ(MatBindToCPU(b->A,B->boundtocpu));
  CHKERRQ(MatBindToCPU(b->B,B->boundtocpu));
  CHKERRQ(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point. Uses the fast "basic" path when every rank's
   entries are non-negative and locally owned; otherwise falls back to the
   general MPIAIJ path and mirrors its extended-COO maps onto the device. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;

  PetscFunctionBegin;
  /* wipe any previous assembly state before re-preallocating */
  CHKERRQ(PetscFree(mpiaij->garray));
  CHKERRQ(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  CHKERRQ(PetscTableDestroy(&mpiaij->colmap));
#else
  CHKERRQ(PetscFree(mpiaij->colmap));
#endif
  CHKERRQ(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  CHKERRQ(MatResetPreallocationCOO_MPIAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    CHKERRQ(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    CHKERRQ(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* device copies of the host extended-COO maps built by
       MatSetPreallocationCOO_MPIAIJ: *imap/*jmap/*perm for local (1) and
       received-remote (2) contributions to A (diag) and B (offdiag), plus the
       send permutation and send/recv value buffers */
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    CHKERRCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    CHKERRCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    CHKERRCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0,nnz).
   Launched 1-D; grid-stride loop, so any grid size is correct. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

/* Scatter-add kernel: for each target nonzero i, accumulate all repeated COO
   contributions kv[perm[jmap[i]..jmap[i+1])] into a[imap[i]].
   Each i is handled by exactly one thread, so no atomics are needed.
   Launched 1-D; grid-stride loop. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

/* COO value insertion entry point. On the extended-COO path: pack and ship
   remote entries through the PetscSF while adding local contributions, then
   add the received remote contributions; all adds run on the device. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    /* NOTE(review): this path assumes v != NULL (v is copied/indexed below) — confirm callers */
    CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    CHKERRQ(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      CHKERRCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      CHKERRCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
      if (size > 1) {
        CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
        CHKERRCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
      }
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) CHKERRQ(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) CHKERRQ(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1)); /* free the staging copy only */
  } else {
    CHKERRQ(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into one sequential matrix on the
   device; optionally return the global column indices of the merged matrix. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  CHKERRQ(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  CHKERRQ(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    CHKERRQ(MatGetLocalSize(Ad,NULL,&dn));
    CHKERRQ(MatGetLocalSize(Ao,NULL,&on));
    CHKERRQ(MatGetOwnershipRangeColumn(A,&cst,NULL));
    CHKERRQ(PetscMalloc1(dn+on,&gidx));
    for (i = 0; i<dn; i++) gidx[i]    = cst + i;   /* diag block: owned columns */
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];   /* offdiag block: mapped via cmap */
    CHKERRQ(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 421 { 422 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 423 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 424 PetscInt i; 425 426 PetscFunctionBegin; 427 CHKERRQ(PetscLayoutSetUp(B->rmap)); 428 CHKERRQ(PetscLayoutSetUp(B->cmap)); 429 if (PetscDefined(USE_DEBUG) && d_nnz) { 430 for (i=0; i<B->rmap->n; i++) { 431 PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 432 } 433 } 434 if (PetscDefined(USE_DEBUG) && o_nnz) { 435 for (i=0; i<B->rmap->n; i++) { 436 PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 437 } 438 } 439 #if defined(PETSC_USE_CTABLE) 440 CHKERRQ(PetscTableDestroy(&b->colmap)); 441 #else 442 CHKERRQ(PetscFree(b->colmap)); 443 #endif 444 CHKERRQ(PetscFree(b->garray)); 445 CHKERRQ(VecDestroy(&b->lvec)); 446 CHKERRQ(VecScatterDestroy(&b->Mvctx)); 447 /* Because the B will have been resized we simply destroy it and create a new one each time */ 448 CHKERRQ(MatDestroy(&b->B)); 449 if (!b->A) { 450 CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A)); 451 CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n)); 452 CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A)); 453 } 454 if (!b->B) { 455 PetscMPIInt size; 456 CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 457 CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B)); 458 CHKERRQ(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? 
B->cmap->N : 0)); 459 CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B)); 460 } 461 CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE)); 462 CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE)); 463 CHKERRQ(MatBindToCPU(b->A,B->boundtocpu)); 464 CHKERRQ(MatBindToCPU(b->B,B->boundtocpu)); 465 CHKERRQ(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz)); 466 CHKERRQ(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz)); 467 CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat)); 468 CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat)); 469 CHKERRQ(MatCUSPARSESetHandle(b->A,cusparseStruct->handle)); 470 CHKERRQ(MatCUSPARSESetHandle(b->B,cusparseStruct->handle)); 471 B->preallocated = PETSC_TRUE; 472 PetscFunctionReturn(0); 473 } 474 475 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 476 { 477 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 478 479 PetscFunctionBegin; 480 CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 481 CHKERRQ((*a->A->ops->mult)(a->A,xx,yy)); 482 CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 483 CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,yy,yy)); 484 PetscFunctionReturn(0); 485 } 486 487 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 488 { 489 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 490 491 PetscFunctionBegin; 492 CHKERRQ(MatZeroEntries(l->A)); 493 CHKERRQ(MatZeroEntries(l->B)); 494 PetscFunctionReturn(0); 495 } 496 497 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 498 { 499 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 500 501 PetscFunctionBegin; 502 CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 503 CHKERRQ((*a->A->ops->multadd)(a->A,xx,yy,zz)); 504 CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 505 CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,zz,zz)); 506 PetscFunctionReturn(0); 507 } 508 509 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 510 { 511 
Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 512 513 PetscFunctionBegin; 514 CHKERRQ((*a->B->ops->multtranspose)(a->B,xx,a->lvec)); 515 CHKERRQ((*a->A->ops->multtranspose)(a->A,xx,yy)); 516 CHKERRQ(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 517 CHKERRQ(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 518 PetscFunctionReturn(0); 519 } 520 521 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 522 { 523 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 524 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 525 526 PetscFunctionBegin; 527 switch (op) { 528 case MAT_CUSPARSE_MULT_DIAG: 529 cusparseStruct->diagGPUMatFormat = format; 530 break; 531 case MAT_CUSPARSE_MULT_OFFDIAG: 532 cusparseStruct->offdiagGPUMatFormat = format; 533 break; 534 case MAT_CUSPARSE_ALL: 535 cusparseStruct->diagGPUMatFormat = format; 536 cusparseStruct->offdiagGPUMatFormat = format; 537 break; 538 default: 539 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 540 } 541 PetscFunctionReturn(0); 542 } 543 544 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 545 { 546 MatCUSPARSEStorageFormat format; 547 PetscBool flg; 548 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 549 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 550 551 PetscFunctionBegin; 552 CHKERRQ(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options")); 553 if (A->factortype==MAT_FACTOR_NONE) { 554 CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 555 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 556 if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format)); 557 CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 558 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg)); 559 if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format)); 560 CHKERRQ(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 561 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 562 if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 563 } 564 CHKERRQ(PetscOptionsTail()); 565 PetscFunctionReturn(0); 566 } 567 568 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 569 { 570 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 571 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 572 PetscObjectState onnz = A->nonzerostate; 573 574 
PetscFunctionBegin; 575 CHKERRQ(MatAssemblyEnd_MPIAIJ(A,mode)); 576 if (mpiaij->lvec) CHKERRQ(VecSetType(mpiaij->lvec,VECSEQCUDA)); 577 if (onnz != A->nonzerostate && cusp->deviceMat) { 578 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 579 580 CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 581 CHKERRQ(PetscNew(&h_mat)); 582 CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 583 CHKERRCUDA(cudaFree(h_mat->colmap)); 584 if (h_mat->allocated_indices) { 585 CHKERRCUDA(cudaFree(h_mat->diag.i)); 586 CHKERRCUDA(cudaFree(h_mat->diag.j)); 587 if (h_mat->offdiag.j) { 588 CHKERRCUDA(cudaFree(h_mat->offdiag.i)); 589 CHKERRCUDA(cudaFree(h_mat->offdiag.j)); 590 } 591 } 592 CHKERRCUDA(cudaFree(d_mat)); 593 CHKERRQ(PetscFree(h_mat)); 594 cusp->deviceMat = NULL; 595 } 596 PetscFunctionReturn(0); 597 } 598 599 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 600 { 601 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 602 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 603 604 PetscFunctionBegin; 605 PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 606 if (cusparseStruct->deviceMat) { 607 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 608 609 CHKERRQ(PetscInfo(A,"Have device matrix\n")); 610 CHKERRQ(PetscNew(&h_mat)); 611 CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 612 CHKERRCUDA(cudaFree(h_mat->colmap)); 613 if (h_mat->allocated_indices) { 614 CHKERRCUDA(cudaFree(h_mat->diag.i)); 615 CHKERRCUDA(cudaFree(h_mat->diag.j)); 616 if (h_mat->offdiag.j) { 617 CHKERRCUDA(cudaFree(h_mat->offdiag.i)); 618 CHKERRCUDA(cudaFree(h_mat->offdiag.j)); 619 } 620 } 621 CHKERRCUDA(cudaFree(d_mat)); 622 CHKERRQ(PetscFree(h_mat)); 623 } 624 try { 625 if (aij->A) CHKERRQ(MatCUSPARSEClearHandle(aij->A)); 626 if (aij->B) CHKERRQ(MatCUSPARSEClearHandle(aij->B)); 627 CHKERRCUSPARSE(cusparseDestroy(cusparseStruct->handle)); 628 /* We want cusparseStruct to use 
PetscDefaultCudaStream 629 if (cusparseStruct->stream) { 630 CHKERRCUDA(cudaStreamDestroy(cusparseStruct->stream)); 631 } 632 */ 633 /* Free COO */ 634 CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 635 delete cusparseStruct; 636 } catch(char *ex) { 637 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 638 } 639 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 640 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 641 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 642 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 643 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 644 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL)); 645 CHKERRQ(MatDestroy_MPIAIJ(A)); 646 PetscFunctionReturn(0); 647 } 648 649 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 650 { 651 Mat_MPIAIJ *a; 652 Mat A; 653 654 PetscFunctionBegin; 655 CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 656 if (reuse == MAT_INITIAL_MATRIX) CHKERRQ(MatDuplicate(B,MAT_COPY_VALUES,newmat)); 657 else if (reuse == MAT_REUSE_MATRIX) CHKERRQ(MatCopy(B,*newmat,SAME_NONZERO_PATTERN)); 658 A = *newmat; 659 A->boundtocpu = PETSC_FALSE; 660 CHKERRQ(PetscFree(A->defaultvectype)); 661 CHKERRQ(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 662 663 a = (Mat_MPIAIJ*)A->data; 664 if (a->A) CHKERRQ(MatSetType(a->A,MATSEQAIJCUSPARSE)); 665 if (a->B) CHKERRQ(MatSetType(a->B,MATSEQAIJCUSPARSE)); 666 if (a->lvec) CHKERRQ(VecSetType(a->lvec,VECSEQCUDA)); 667 668 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 669 Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE; 670 CHKERRCUSPARSE(cusparseCreate(&(cusp->handle))); 671 a->spptr = cusp; 672 } 673 674 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 675 A->ops->mult = 
MatMult_MPIAIJCUSPARSE; 676 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 677 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 678 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 679 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 680 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 681 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 682 683 CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE)); 684 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 685 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 686 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 687 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE)); 688 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE)); 689 #if defined(PETSC_HAVE_HYPRE) 690 CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 691 #endif 692 PetscFunctionReturn(0); 693 } 694 695 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 696 { 697 PetscFunctionBegin; 698 CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 699 CHKERRQ(MatCreate_MPIAIJ(A)); 700 CHKERRQ(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A)); 701 PetscFunctionReturn(0); 702 } 703 704 /*@ 705 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 706 (the default parallel PETSc format). This matrix will ultimately pushed down 707 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 708 assembly performance the user should preallocate the matrix storage by setting 709 the parameter nz (or the array nnz). 
By setting these parameters accurately, 710 performance during matrix assembly can be increased by more than a factor of 50. 711 712 Collective 713 714 Input Parameters: 715 + comm - MPI communicator, set to PETSC_COMM_SELF 716 . m - number of rows 717 . n - number of columns 718 . nz - number of nonzeros per row (same for all rows) 719 - nnz - array containing the number of nonzeros in the various rows 720 (possibly different for each row) or NULL 721 722 Output Parameter: 723 . A - the matrix 724 725 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 726 MatXXXXSetPreallocation() paradigm instead of this routine directly. 727 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 728 729 Notes: 730 If nnz is given then nz is ignored 731 732 The AIJ format (also called the Yale sparse matrix format or 733 compressed row storage), is fully compatible with standard Fortran 77 734 storage. That is, the stored row and column indices can begin at 735 either one (as in Fortran) or zero. See the users' manual for details. 736 737 Specify the preallocated storage with either nz or nnz (not both). 738 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 739 allocation. For large problems you MUST preallocate memory or you 740 will get TERRIBLE performance, see the users' manual chapter on matrices. 741 742 By default, this format uses inodes (identical nodes) when possible, to 743 improve numerical efficiency of matrix-vector products and solves. We 744 search for consecutive rows with the same nonzero structure, thereby 745 reusing matrix information to achieve increased efficiency. 
746 747 Level: intermediate 748 749 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 750 @*/ 751 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 752 { 753 PetscMPIInt size; 754 755 PetscFunctionBegin; 756 CHKERRQ(MatCreate(comm,A)); 757 CHKERRQ(MatSetSizes(*A,m,n,M,N)); 758 CHKERRMPI(MPI_Comm_size(comm,&size)); 759 if (size > 1) { 760 CHKERRQ(MatSetType(*A,MATMPIAIJCUSPARSE)); 761 CHKERRQ(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 762 } else { 763 CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE)); 764 CHKERRQ(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 765 } 766 PetscFunctionReturn(0); 767 } 768 769 /*MC 770 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 771 772 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 773 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 774 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 775 776 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 777 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 778 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 779 for communicators controlling multiple processes. It is recommended that you call both of 780 the above preallocation routines for simplicity. 781 782 Options Database Keys: 783 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 784 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

   Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

/*
   MatCUSPARSEGetDeviceMatWrite - get GPU pointers to a stripped down Mat, for both seq and MPI Mat.

   Builds (on first call) a PetscSplitCSRDataStructure on the host that mirrors the
   device CSR data (diag block, and for MPI also the off-diag block plus colmap),
   copies it to the device, and caches the device pointer in the cusparse spptr.
   Subsequent calls reuse the cached device matrix.

   Input Parameter:
.  A - the matrix; must be assembled, and both blocks must be in MAT_CUSPARSE_CSR format

   Output Parameter:
.  B - the device-resident split-CSR structure (NULL if A is factored)

   Side effect: marks A->offloadmask = PETSC_OFFLOAD_GPU, since the caller is
   expected to write matrix values on the device.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; /* device CSR index arrays (32-bit, as stored by cusparse) */
  PetscScalar                *aa = NULL,*ba = NULL;                       /* device CSR value arrays */
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry no device CSR data to expose */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  /* locate the host Mat_SeqAIJ (jaca/jacb) and the device CSR structures */
  if (size == 1) {
    PetscBool isseqaij;

    CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* an MPIAIJ matrix on a one-rank communicator: use its diagonal block */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL; /* no off-diagonal block in the sequential case */
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* compressed-row storage: build (once) full row offsets on the device from the host i[] */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same treatment for the diagonal block's row offsets */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy on device
    CHKERRQ(PetscInfo(A,"Create device matrix\n"));
    CHKERRQ(PetscNew(&h_mat));
    CHKERRCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array: global column -> local off-diag column + 1 (0 == not present) */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      CHKERRQ(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = ii + 1; /* store as PetscInt; no narrowing cast */
#if defined(PETSC_USE_64BIT_INDICES)
      { // cusparse indices are 32-bit: have to make a 64-bit PetscInt version of the off-diag i/j
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        CHKERRCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        CHKERRCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        CHKERRCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        CHKERRCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        CHKERRCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* these device copies are owned by the device mat */

        CHKERRQ(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE; /* borrowed from cusparse storage */
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      CHKERRCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      CHKERRCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      CHKERRQ(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* %zu for sizeof (size_t); the old "%d" was a format mismatch */
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %zu",sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      CHKERRCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      CHKERRCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      CHKERRCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      CHKERRCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      CHKERRCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      CHKERRCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      CHKERRQ(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    CHKERRCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    CHKERRQ(PetscFree(h_mat));
  } else {
    CHKERRQ(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller will write values on the device */
  PetscFunctionReturn(0);
}