1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 3 4 #include <petscconf.h> 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 #include <thrust/advance.h> 9 #include <thrust/partition.h> 10 #include <thrust/sort.h> 11 #include <thrust/unique.h> 12 #include <petscsf.h> 13 14 struct VecCUDAEquals 15 { 16 template <typename Tuple> 17 __host__ __device__ 18 void operator()(Tuple t) 19 { 20 thrust::get<1>(t) = thrust::get<0>(t); 21 } 22 }; 23 24 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 25 { 26 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 27 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 28 PetscInt n = cusp->coo_nd + cusp->coo_no; 29 PetscErrorCode ierr; 30 31 PetscFunctionBegin; 32 if (cusp->coo_p && v) { 33 thrust::device_ptr<const PetscScalar> d_v; 34 THRUSTARRAY *w = NULL; 35 36 if (isCudaMem(v)) { 37 d_v = thrust::device_pointer_cast(v); 38 } else { 39 w = new THRUSTARRAY(n); 40 w->assign(v,v+n); 41 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 42 d_v = w->data(); 43 } 44 45 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 46 cusp->coo_pw->begin())); 47 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 48 cusp->coo_pw->end())); 49 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 50 thrust::for_each(zibit,zieit,VecCUDAEquals()); 51 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 52 delete w; 53 ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 54 ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 55 } else { 56 ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr); 57 ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 58 } 59 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 60 A->num_ass++; 61 A->assembled = PETSC_TRUE; 62 A->ass_nonzerostate = A->nonzerostate; 63 A->offloadmask = PETSC_OFFLOAD_GPU; 64 PetscFunctionReturn(0); 65 } 66 67 template <typename Tuple> 68 struct IsNotOffDiagT 69 { 70 PetscInt _cstart,_cend; 71 72 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 73 __host__ __device__ 74 inline bool operator()(Tuple t) 75 { 76 return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 77 } 78 }; 79 80 struct IsOffDiag 81 { 82 PetscInt _cstart,_cend; 83 84 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 85 __host__ __device__ 86 inline bool operator() (const PetscInt &c) 87 { 88 return c < _cstart || c >= _cend; 89 } 90 }; 91 92 struct GlobToLoc 93 { 94 PetscInt _start; 95 96 GlobToLoc(PetscInt start) : _start(start) {} 97 __host__ __device__ 98 inline PetscInt operator() (const PetscInt &c) 99 { 100 return c - _start; 101 } 102 }; 103 104 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 105 { 106 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 107 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 108 PetscErrorCode ierr; 109 PetscInt N,*jj; 110 size_t noff = 0; 111 THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 112 THRUSTINTARRAY d_j(n); 113 ISLocalToGlobalMapping l2g; 114 cudaError_t cerr; 115 116 PetscFunctionBegin; 117 if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 118 if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 119 ierr = PetscFree(b->garray);CHKERRQ(ierr); 120 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 121 ierr = MatDestroy(&b->A);CHKERRQ(ierr); 122 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 123 124 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 125 d_i.assign(coo_i,coo_i+n); 126 d_j.assign(coo_j,coo_j+n); 127 delete cusp->coo_p; 128 delete cusp->coo_pw; 129 cusp->coo_p = NULL; 130 cusp->coo_pw = NULL; 131 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 132 auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 133 auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 134 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 135 cusp->coo_p = new THRUSTINTARRAY(n); 136 cusp->coo_pw = new THRUSTARRAY(n); 137 thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 138 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 139 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 140 auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 141 firstoffd = mzipp.get_iterator_tuple().get<1>(); 142 } 143 cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 144 cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 145 146 /* from global to local */ 147 thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 148 thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 149 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 150 151 /* copy offdiag column indices to map on the CPU */ 152 ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */ 153 cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 154 auto o_j = d_j.begin(); 155 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 156 thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */ 157 thrust::sort(thrust::device,o_j,d_j.end()); 158 auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */ 159 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 160 noff = thrust::distance(o_j,wit); 161 ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr); 162 cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 163 ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 164 ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 165 ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 166 ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr); 167 PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no); 168 ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 169 170 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 171 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 172 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 173 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 174 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 175 ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 176 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 177 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 178 179 /* GPU memory, cusparse specific call handles it internally */ 180 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 181 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 182 ierr = PetscFree(jj);CHKERRQ(ierr); 183 184 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 185 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 186 ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 187 ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 188 /* 189 ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 190 ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 191 */ 192 ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 193 B->preallocated = PETSC_TRUE; 194 B->nonzerostate++; 195 196 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 197 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 198 B->offloadmask = PETSC_OFFLOAD_CPU; 199 B->assembled = PETSC_FALSE; 200 B->was_assembled = PETSC_FALSE; 201 PetscFunctionReturn(0); 202 } 203 204 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 205 { 206 PetscErrorCode ierr; 207 Mat newmat; 208 Mat_MPIAIJ *mpiaij; 209 Mat_MPIAIJCUSPARSE *mpidev; 210 PetscInt coo_basic = 1; 211 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 212 PetscInt rstart,rend; 213 cudaError_t cerr; 214 215 PetscFunctionBegin; 216 if (coo_i) { 217 ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr); 218 ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr); 219 if (PetscMemTypeHost(mtype)) { 220 for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */ 221 if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;} 222 } 223 } 224 } 225 /* All ranks must agree on the value of coo_basic */ 226 ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr); 227 228 if (coo_basic) { 229 ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 230 } else { 231 mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 232 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 233 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 234 ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr); 235 ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */ 236 ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr); 237 ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 238 ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 239 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 240 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 241 mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderReplace() */ 242 mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 243 mpidev->use_extended_coo = PETSC_TRUE; 244 245 cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr); 246 cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 247 cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr); 248 249 cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr); 250 cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 251 cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr); 252 253 cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr); 254 cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 255 cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr); 256 257 cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr); 258 cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 259 cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr); 260 261 cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr); 262 cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr); 263 cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr); 264 265 cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 266 cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 267 cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 268 269 cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 270 cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 271 cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 272 273 cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 274 cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 275 cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 276 277 cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 278 cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 279 cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 280 281 cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 282 } 283 PetscFunctionReturn(0); 284 } 285 286 __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[]) 287 { 288 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 289 const PetscCount grid_size = gridDim.x * blockDim.x; 290 for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]]; 291 } 292 293 __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[]) 294 { 295 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 296 const PetscCount grid_size = gridDim.x * blockDim.x; 297 for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];} 298 } 299 300 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode) 301 { 302 PetscErrorCode ierr; 303 cudaError_t cerr; 304 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 305 Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 306 Mat A = mpiaij->A,B = mpiaij->B; 307 PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2; 308 PetscScalar *Aa,*Ba; 309 PetscScalar *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d; 310 const PetscScalar *v1 = v; 311 const PetscCount *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d; 312 const PetscCount *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d; 313 const PetscCount *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d; 314 const PetscCount *Cperm1 = mpidev->Cperm1_d; 315 PetscMemType memtype; 316 317 PetscFunctionBegin; 318 if (mpidev->use_extended_coo) { 319 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 320 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 321 cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr); 322 cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 323 } 324 325 if (imode == INSERT_VALUES) { 326 ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */ 327 ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr); 328 cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr); 329 cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr); 330 } else { 331 ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */ 332 ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr); 333 } 334 335 /* Pack entries to be sent to remote */ 336 MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend); 337 338 /* Send remote entries to their owner and overlap the communication with local computation */ 339 ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr); 340 /* Add local entries to A and B */ 341 MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa); 342 MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba); 343 ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr); 344 345 /* Add received remote entries to A and B */ 346 MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa); 347 MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba); 348 349 if (imode == INSERT_VALUES) { 350 ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr); 351 ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr); 352 } else { 353 ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr); 354 ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr); 355 } 356 if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);} 357 } else { 358 ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr); 359 } 360 PetscFunctionReturn(0); 361 } 362 363 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 364 { 365 Mat Ad,Ao; 366 const PetscInt *cmap; 367 PetscErrorCode ierr; 368 369 PetscFunctionBegin; 370 ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 371 ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 372 if (glob) { 373 PetscInt cst, i, dn, on, *gidx; 374 375 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 376 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 377 ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 378 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 379 for (i=0; i<dn; i++) gidx[i] = cst + i; 380 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 381 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 387 { 388 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 389 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 390 PetscErrorCode ierr; 391 PetscInt i; 392 393 PetscFunctionBegin; 394 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 395 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 396 if (PetscDefined(USE_DEBUG) && d_nnz) { 397 for (i=0; i<B->rmap->n; i++) { 398 PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 399 } 400 } 401 if (PetscDefined(USE_DEBUG) && o_nnz) { 402 for (i=0; i<B->rmap->n; i++) { 403 PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 404 } 405 } 406 #if defined(PETSC_USE_CTABLE) 407 ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr); 408 #else 409 ierr = PetscFree(b->colmap);CHKERRQ(ierr); 410 #endif 411 ierr = PetscFree(b->garray);CHKERRQ(ierr); 412 ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 413 ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr); 414 /* Because the B will have been resized we simply destroy it and create a new one each time */ 415 ierr = MatDestroy(&b->B);CHKERRQ(ierr); 416 if (!b->A) { 417 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 418 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 419 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 420 } 421 if (!b->B) { 422 PetscMPIInt size; 423 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr); 424 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 425 ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr); 426 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 427 } 428 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 429 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 430 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 431 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 432 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 433 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 434 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 435 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 436 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 437 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 438 /* Let A, B use b's handle with pre-set stream 439 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 440 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 441 */ 442 B->preallocated = PETSC_TRUE; 443 PetscFunctionReturn(0); 444 } 445 446 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 447 { 448 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 449 PetscErrorCode ierr; 450 PetscInt nt; 451 452 PetscFunctionBegin; 453 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 454 PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt); 455 /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */ 456 if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);} 457 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 458 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 459 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 460 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 461 PetscFunctionReturn(0); 462 } 463 464 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 465 { 466 Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 467 PetscErrorCode ierr; 468 469 PetscFunctionBegin; 470 ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 471 ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 472 PetscFunctionReturn(0); 473 } 474 475 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 476 { 477 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 478 PetscErrorCode ierr; 479 PetscInt nt; 480 481 PetscFunctionBegin; 482 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 483 PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt); 484 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 485 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 486 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 487 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 488 PetscFunctionReturn(0); 489 } 490 491 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 492 { 493 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 494 PetscErrorCode ierr; 495 PetscInt nt; 496 497 PetscFunctionBegin; 498 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 499 PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt); 500 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 501 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 502 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 503 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 504 PetscFunctionReturn(0); 505 } 506 507 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 508 { 509 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 510 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 511 512 PetscFunctionBegin; 513 switch (op) { 514 case MAT_CUSPARSE_MULT_DIAG: 515 cusparseStruct->diagGPUMatFormat = format; 516 break; 517 case MAT_CUSPARSE_MULT_OFFDIAG: 518 cusparseStruct->offdiagGPUMatFormat = format; 519 break; 520 case MAT_CUSPARSE_ALL: 521 cusparseStruct->diagGPUMatFormat = format; 522 cusparseStruct->offdiagGPUMatFormat = format; 523 break; 524 default: 525 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 526 } 527 PetscFunctionReturn(0); 528 } 529 530 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 531 { 532 MatCUSPARSEStorageFormat format; 533 PetscErrorCode ierr; 534 PetscBool flg; 535 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 536 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 537 538 PetscFunctionBegin; 539 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 540 if (A->factortype==MAT_FACTOR_NONE) { 541 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 542 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 543 if (flg) { 544 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 545 } 546 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 547 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 548 if (flg) { 549 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 550 } 551 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 552 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 553 if (flg) { 554 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 555 } 556 } 557 ierr = PetscOptionsTail();CHKERRQ(ierr); 558 PetscFunctionReturn(0); 559 } 560 561 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 562 { 563 PetscErrorCode ierr; 564 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 565 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 566 PetscObjectState onnz = A->nonzerostate; 567 568 PetscFunctionBegin; 569 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 570 if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); } 571 if (onnz != A->nonzerostate && cusp->deviceMat) { 572 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 573 cudaError_t cerr; 574 575 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 576 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 577 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 578 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 579 if (h_mat->allocated_indices) { 580 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 581 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 582 if (h_mat->offdiag.j) { 583 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 584 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 585 } 586 } 587 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 588 ierr = PetscFree(h_mat);CHKERRQ(ierr); 589 cusp->deviceMat = NULL; 590 } 591 PetscFunctionReturn(0); 592 } 593 594 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 595 { 596 PetscErrorCode ierr; 597 cudaError_t cerr; 598 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 599 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 600 cusparseStatus_t stat; 601 602 PetscFunctionBegin; 603 PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 604 if (cusparseStruct->deviceMat) { 605 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 606 607 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 608 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 609 cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 610 cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 611 if (h_mat->allocated_indices) { 612 cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 613 cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 614 if (h_mat->offdiag.j) { 615 cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 616 cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 617 } 618 } 619 cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 620 ierr = PetscFree(h_mat);CHKERRQ(ierr); 621 } 622 try { 623 if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 624 if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 625 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 626 /* We want cusparseStruct to use PetscDefaultCudaStream 627 if (cusparseStruct->stream) { 628 cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 629 } 630 */ 631 /* Extended COO */ 632 if (cusparseStruct->use_extended_coo) { 633 cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr); 634 cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr); 635 cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr); 636 cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr); 637 cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr); 638 cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr); 639 cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr); 640 cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr); 641 cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr); 642 cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr); 643 cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr); 644 cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr); 645 cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr); 646 cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr); 647 cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr); 648 } else { 649 delete cusparseStruct->coo_p; 650 delete cusparseStruct->coo_pw; 651 } 652 delete cusparseStruct; 653 } catch(char *ex) { 654 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 655 } 656 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 657 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 658 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 659 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 660 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 661 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 662 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 663 PetscFunctionReturn(0); 664 } 665 666 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 667 { 668 PetscErrorCode ierr; 669 Mat_MPIAIJ *a; 670 cusparseStatus_t stat; 671 Mat A; 672 673 PetscFunctionBegin; 674 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 675 if (reuse == MAT_INITIAL_MATRIX) { 676 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 677 } else if (reuse == MAT_REUSE_MATRIX) { 678 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 679 } 680 A = *newmat; 681 A->boundtocpu = PETSC_FALSE; 682 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 683 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 684 685 a = (Mat_MPIAIJ*)A->data; 686 if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 687 if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 688 if (a->lvec) { 689 ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 690 } 691 692 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 693 Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE; 694 stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat); 695 a->spptr = cusp; 696 } 697 698 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 699 A->ops->mult = MatMult_MPIAIJCUSPARSE; 700 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 701 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 702 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 703 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 704 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 705 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 706 707 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 708 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 709 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 710 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 711 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 712 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 713 #if defined(PETSC_HAVE_HYPRE) 714 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 715 #endif 716 PetscFunctionReturn(0); 717 } 718 719 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 720 { 721 PetscErrorCode ierr; 722 723 PetscFunctionBegin; 724 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 725 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 726 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 727 PetscFunctionReturn(0); 728 } 729 730 /*@ 731 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 732 (the default parallel PETSc format). This matrix will ultimately pushed down 733 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 734 assembly performance the user should preallocate the matrix storage by setting 735 the parameter nz (or the array nnz). By setting these parameters accurately, 736 performance during matrix assembly can be increased by more than a factor of 50. 737 738 Collective 739 740 Input Parameters: 741 + comm - MPI communicator, set to PETSC_COMM_SELF 742 . m - number of rows 743 . n - number of columns 744 . nz - number of nonzeros per row (same for all rows) 745 - nnz - array containing the number of nonzeros in the various rows 746 (possibly different for each row) or NULL 747 748 Output Parameter: 749 . A - the matrix 750 751 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 752 MatXXXXSetPreallocation() paradigm instead of this routine directly. 753 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 754 755 Notes: 756 If nnz is given then nz is ignored 757 758 The AIJ format (also called the Yale sparse matrix format or 759 compressed row storage), is fully compatible with standard Fortran 77 760 storage. That is, the stored row and column indices can begin at 761 either one (as in Fortran) or zero. See the users' manual for details. 762 763 Specify the preallocated storage with either nz or nnz (not both). 764 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 765 allocation. For large problems you MUST preallocate memory or you 766 will get TERRIBLE performance, see the users' manual chapter on matrices. 767 768 By default, this format uses inodes (identical nodes) when possible, to 769 improve numerical efficiency of matrix-vector products and solves. We 770 search for consecutive rows with the same nonzero structure, thereby 771 reusing matrix information to achieve increased efficiency. 772 773 Level: intermediate 774 775 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 776 @*/ 777 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 778 { 779 PetscErrorCode ierr; 780 PetscMPIInt size; 781 782 PetscFunctionBegin; 783 ierr = MatCreate(comm,A);CHKERRQ(ierr); 784 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 785 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 786 if (size > 1) { 787 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 788 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 789 } else { 790 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 791 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 792 } 793 PetscFunctionReturn(0); 794 } 795 796 /*MC 797 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 798 799 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 800 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 801 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 802 803 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 804 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 805 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 806 for communicators controlling multiple processes. It is recommended that you call both of 807 the above preallocation routines for simplicity. 808 809 Options Database Keys: 810 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 811 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 812 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 813 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 814 815 Level: beginner 816 817 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 818 M*/ 819 820 /*MC 821 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 822 823 Level: beginner 824 825 .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 826 M*/ 827 828 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 829 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 830 { 831 PetscSplitCSRDataStructure d_mat; 832 PetscMPIInt size; 833 PetscErrorCode ierr; 834 int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 835 PetscScalar *aa = NULL,*ba = NULL; 836 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 837 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 838 CsrMatrix *matrixA = NULL,*matrixB = NULL; 839 840 PetscFunctionBegin; 841 PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 842 if (A->factortype != MAT_FACTOR_NONE) { 843 *B = NULL; 844 PetscFunctionReturn(0); 845 } 846 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 847 // get jaca 848 if (size == 1) { 849 PetscBool isseqaij; 850 851 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 852 if (isseqaij) { 853 jaca = (Mat_SeqAIJ*)A->data; 854 PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 855 cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 856 d_mat = cusparsestructA->deviceMat; 857 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 858 } else { 859 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 860 PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 861 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 862 jaca = (Mat_SeqAIJ*)aij->A->data; 863 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 864 d_mat = spptr->deviceMat; 865 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 866 } 867 if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 868 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 869 PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 870 matrixA = (CsrMatrix*)matstruct->mat; 871 bi = NULL; 872 bj = NULL; 873 ba = NULL; 874 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 875 } else { 876 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 877 PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 878 jaca = (Mat_SeqAIJ*)aij->A->data; 879 jacb = (Mat_SeqAIJ*)aij->B->data; 880 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 881 882 PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 883 cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 884 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 885 PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 886 if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 887 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 888 ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr); 889 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 890 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 891 PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 892 PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 893 matrixA = (CsrMatrix*)matstructA->mat; 894 matrixB = (CsrMatrix*)matstructB->mat; 895 if (jacb->compressedrow.use) { 896 if (!cusparsestructB->rowoffsets_gpu) { 897 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 898 cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 899 } 900 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 901 } else { 902 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 903 } 904 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 905 ba = thrust::raw_pointer_cast(matrixB->values->data()); 906 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 907 d_mat = spptr->deviceMat; 908 } 909 if (jaca->compressedrow.use) { 910 if (!cusparsestructA->rowoffsets_gpu) { 911 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 912 cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 913 } 914 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 915 } else { 916 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 917 } 918 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 919 aa = thrust::raw_pointer_cast(matrixA->values->data()); 920 921 if (!d_mat) { 922 cudaError_t cerr; 923 PetscSplitCSRDataStructure h_mat; 924 925 // create and populate strucy on host and copy on device 926 ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 927 ierr = PetscNew(&h_mat);CHKERRQ(ierr); 928 cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr); 929 if (size > 1) { /* need the colmap array */ 930 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 931 PetscInt *colmap; 932 PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 933 934 PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 935 936 ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr); 937 for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 938 #if defined(PETSC_USE_64BIT_INDICES) 939 { // have to make a long version of these 940 int *h_bi32, *h_bj32; 941 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 942 ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr); 943 cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 944 for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 945 cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 946 for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 947 948 cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr); 949 cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 950 cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr); 951 cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 952 953 h_mat->offdiag.i = d_bi64; 954 h_mat->offdiag.j = d_bj64; 955 h_mat->allocated_indices = PETSC_TRUE; 956 957 ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr); 958 } 959 #else 960 h_mat->offdiag.i = (PetscInt*)bi; 961 h_mat->offdiag.j = (PetscInt*)bj; 962 h_mat->allocated_indices = PETSC_FALSE; 963 #endif 964 h_mat->offdiag.a = ba; 965 h_mat->offdiag.n = A->rmap->n; 966 967 cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr); 968 cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 969 ierr = PetscFree(colmap);CHKERRQ(ierr); 970 } 971 h_mat->rstart = A->rmap->rstart; 972 h_mat->rend = A->rmap->rend; 973 h_mat->cstart = A->cmap->rstart; 974 h_mat->cend = A->cmap->rend; 975 h_mat->M = A->cmap->N; 976 #if defined(PETSC_USE_64BIT_INDICES) 977 { 978 PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 979 int *h_ai32, *h_aj32; 980 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 981 ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr); 982 cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 983 for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 984 cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 985 for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 986 987 cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr); 988 cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 989 cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr); 990 cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 991 992 h_mat->diag.i = d_ai64; 993 h_mat->diag.j = d_aj64; 994 h_mat->allocated_indices = PETSC_TRUE; 995 996 ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr); 997 } 998 #else 999 h_mat->diag.i = (PetscInt*)ai; 1000 h_mat->diag.j = (PetscInt*)aj; 1001 h_mat->allocated_indices = PETSC_FALSE; 1002 #endif 1003 h_mat->diag.a = aa; 1004 h_mat->diag.n = A->rmap->n; 1005 h_mat->rank = PetscGlobalRank; 1006 // copy pointers and metadata to device 1007 cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 1008 ierr = PetscFree(h_mat);CHKERRQ(ierr); 1009 } else { 1010 ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr); 1011 } 1012 *B = d_mat; 1013 A->offloadmask = PETSC_OFFLOAD_GPU; 1014 PetscFunctionReturn(0); 1015 } 1016