#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
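
/*
  Aside (illustrative, not part of the build): the zip/for_each above is just a
  gather of v[] through the permutation coo_p into the work array coo_pw, i.e.
  coo_pw[k] = v[coo_p[k]]. A minimal standalone thrust equivalent, assuming
  device containers of matching sizes:

    #include <thrust/device_vector.h>
    #include <thrust/gather.h>

    static void GatherByPerm(const thrust::device_vector<PetscInt> &perm,
                             const thrust::device_vector<PetscScalar> &v,
                             thrust::device_vector<PetscScalar> &out)
    {
      thrust::gather(perm.begin(),perm.end(),v.begin(),out.begin()); // out[k] = v[perm[k]]
    }
*/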

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
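  /*
    The next few lines compact these off-diagonal column indices on the device:
    thrust::sort groups duplicates and thrust::unique drops them in place. For
    example, global column ids [7, 2, 11, 7] become [2, 7, 7, 11] after the sort
    and [2, 7, 11] after unique, so noff = 3; the distinct ids left at the front
    of the range become garray, the local-to-global column map of B.
  */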
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* allocate the garray; the columns of B will not be mapped in the multiply setup */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected mapped size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
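
/*
  For reference, a sketch of the user-facing COO assembly these routines back
  (indices and sizes are made up for illustration):

    PetscInt    i[] = {0,0,1};           // global row indices
    PetscInt    j[] = {0,2,1};           // global column indices
    PetscScalar v[] = {1.0,2.0,3.0};

    ierr = MatSetPreallocationCOO(A,3,i,j);CHKERRQ(ierr);    // pattern, set once
    ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr); // values, may repeat
*/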

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
#endif
  ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  ierr = MatResetPreallocationCOO_MPIAIJ(mat);CHKERRQ(ierr);
  ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);CHKERRQ(ierr);
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    ierr = MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
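
/*
  How the extended-COO maps built above are consumed (see MatAddCOOValues
  below): for the i-th unique nonzero of a block, jmap[i]..jmap[i+1] delimits
  the run of input entries that land on it, perm[k] gives each entry's position
  in the user's v[], and imap[i] is its offset in the CSR value array, so

    a[imap[i]] += v[perm[k]]   for jmap[i] <= k < jmap[i+1]

  accumulates duplicate COO entries into a single stored value.
*/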

__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}
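
/*
  Both kernels use a grid-stride loop, so they are correct for any launch
  configuration; the <<<(n+255)/256,256>>> launches below simply round the grid
  up to one thread per entry. A minimal sketch of the same pattern, with a
  hypothetical kernel name:

    __global__ void ScaleValues(PetscScalar a[],PetscCount n,PetscScalar s)
    {
      const PetscCount stride = gridDim.x*blockDim.x;
      for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i<n; i+=stride) a[i] *= s;
    }
    // launched as: ScaleValues<<<(n+255)/256,256>>>(a,n,2.0);
*/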

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (size > 1) {
        ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
        cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      if (size > 1) { ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr); }
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) { ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr); }
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) { ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr); }

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr); }
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr); }
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
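
/*
  Note the split-phase communication above: PetscSFReduceWithMemTypeBegin()
  starts shipping remote COO values from the device send buffer, the kernels
  adding the locally owned entries to A and B run while the messages are in
  flight, and PetscSFReduceEnd() completes the transfer before the received
  values are accumulated.
*/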

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
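
/*
  The multiply above follows the usual MPIAIJ decomposition y = A_d*x_local + B_o*x_ghost:
  the scatter gathering ghost entries of x into a->lvec is started, the diagonal
  block is applied while the communication is in flight, and the off-diagonal
  block is applied to the gathered ghost vector once the scatter completes.
*/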

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
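
/*
  Example (illustrative) command lines exercising the options registered above;
  the executable name is hypothetical:

    ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
    ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format csr \
          -mat_cusparse_mult_offdiag_storage_format hyb
*/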

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Free COO */
    ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(A);CHKERRQ(ierr);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage of each block with either the scalar form
   (d_nz/o_nz) or the array form (d_nnz/o_nnz), not both. Set d_nz and o_nz to
   PETSC_DEFAULT and d_nnz and o_nnz to NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
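
/*
  Illustrative call (sizes are arbitrary): a 100x100 matrix distributed over
  PETSC_COMM_WORLD with at most 5 nonzeros per row in the diagonal block and 2
  in the off-diagonal block:

    Mat A;
    ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/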

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/
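
/*
  Illustrative (hypothetical) consumer of the device struct returned by
  MatCUSPARSEGetDeviceMatWrite() below; it assumes, as in this file, that
  diag.i holds the row offsets and diag.n the local row count:

    __global__ void CountDiagNnz(PetscSplitCSRDataStructure d_mat,PetscCount *nnz)
    {
      if (blockIdx.x == 0 && threadIdx.x == 0) *nnz = d_mat->diag.i[d_mat->diag.n] - d_mat->diag.i[0];
    }
*/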

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host and copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}