#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

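/*
  The Basic COO path above is driven through PETSc's public COO assembly API.
  A minimal caller-side sketch (standard API; the index and value arrays are
  made-up illustrations):

    PetscInt    coo_i[] = {0,0,1};        // global row indices
    PetscInt    coo_j[] = {0,2,1};        // global column indices
    PetscScalar coo_v[] = {1.0,2.0,3.0};  // one value per (i,j) pair

    ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr); // analyze the pattern once
    ierr = MatSetValuesCOO(A,coo_v,ADD_VALUES);CHKERRQ(ierr);     // fill values; can be repeated
*/
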
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

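/*
  These functors classify a COO entry by its column index c: the entry belongs
  to the diagonal block when cstart <= c < cend (the columns owned by this
  rank) and to the off-diagonal block otherwise. For example, with cstart=10
  and cend=20, IsOffDiag returns false for c=15 but true for c=3 or c=20, and
  GlobToLoc(10) maps the owned global column 15 to local column 5.
*/
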
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

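/*
  A small worked example of the off-diagonal compaction above: if this rank
  owns columns [10,20) and the off-diagonal COO column ids are {25,3,25,7},
  the sort/unique pass yields garray = {3,7,25} and the hash-based mapping
  rewrites jj[] = {2,0,2,1}, i.e. column indices local to b->B whose
  local-to-global translation is exactly garray.
*/
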
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat                newmat;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscInt           coo_basic = 1;
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

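/*
  Layout of the mapping arrays allocated above (inferred from the host-side
  analysis in MatSetPreallocationCOO_MPIAIJ and from the kernels below): for
  the i-th nonzero of the diagonal block that receives contributions,
  Aimap1[i] is its position in the CSR value array, and the contributing COO
  values are v[Aperm1[k]] for Ajmap1[i] <= k < Ajmap1[i+1]. The *2 arrays
  play the same role for entries received from remote ranks, while Cperm1
  selects the entries packed into sendbuf for the PetscSF exchange.
*/
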
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) {
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];
  }
}

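/*
  Both kernels above use a grid-stride loop: a thread with global index i
  handles entries i, i+G, i+2G, ... with G = gridDim.x*blockDim.x, so any
  launch geometry covers all nnz entries. The launches below use 256-thread
  blocks with (nnz+255)/256 blocks, which makes G >= nnz and gives each
  thread at most one entry.
*/
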
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] on the host, we need to copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (v) {cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);}
      else   {cerr = cudaMemset((void*)v1,0,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);}
    }

    if (imode == INSERT_VALUES) {
      ierr = MatZeroEntries(A);CHKERRQ(ierr);
      ierr = MatZeroEntries(B);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

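/*
  The function above follows the usual PETSc overlap idiom for assembly: start
  the PetscSF reduction that routes remote entries to their owners, run the
  local kernels while messages are in flight, then end the reduction before
  consuming the receive buffer:

    PetscSFReduceWithMemTypeBegin(sf,...);   // kick off communication
    // ... independent local GPU work ...
    PetscSFReduceEnd(sf,...);                // wait; recvbuf now valid
*/
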
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

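/*
  Caller-side preallocation for this type, sketched with made-up per-row
  estimates (3 nonzeros per row in the diagonal block, 2 in the off-diagonal
  block):

    ierr = MatSetType(B,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(B,3,NULL,2,NULL);CHKERRQ(ierr);
*/
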
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

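/*
  With Ad the diagonal block, Ao the off-diagonal block, and lvec the gathered
  ghost values of x, the products above compute

    y = Ad*x + Ao*lvec          (MatMult)
    z = y + Ad*x + Ao*lvec      (MatMultAdd)

  where the ghost-value scatter overlaps with the local Ad product.
*/
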
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

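/*
  Typical ways to obtain this matrix type at runtime (standard PETSc usage):

    ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);   // programmatically
    // or: -mat_type mpiaijcusparse with MatSetFromOptions()
    // or: ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
*/
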
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

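/*
  A minimal creation sketch for MatCreateAIJCUSPARSE() documented above,
  assuming a 100x100 global matrix with PETSC_DECIDE local sizes and rough
  per-row estimates (5 diagonal-block and 2 off-diagonal-block nonzeros):

    Mat A;
    ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
    // ... set values and assemble ...
    ierr = MatDestroy(&A);CHKERRQ(ierr);
*/
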
// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format != MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a 64-bit version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0; i<A->rmap->n+1; i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0; i<jacb->nz; i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %zu",sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0; i<A->rmap->n+1; i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0; i<jaca->nz; i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}