#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather the input values into the partitioned (diagonal-first) order recorded in coo_p */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator()(const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal entries first and off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* copy offdiag column indices to a map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store the compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique the offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* returns the end iterator of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory: the cusparse-specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
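
/*
  Illustration (not part of the build): the Basic preallocation path above partitions the
  COO entries so that diagonal-block entries come first and off-diagonal-block entries
  follow. For a hypothetical rank owning rows [2,4) and columns [2,4) of a 6x6 matrix,
  the input

    coo_i = {2, 2, 3, 3}   coo_j = {2, 5, 0, 3}

  yields coo_nd = 2 diagonal entries (j in [2,4)) and coo_no = 2 off-diagonal entries,
  with coo_p recording the permutation so that MatSetValuesCOO_MPIAIJCUSPARSE_Basic()
  can later gather v[] into the same partitioned order.
*/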
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat                newmat;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscInt           coo_basic = 1;
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero the matrix on the device */
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* Gather the values destined for remote ranks into a contiguous send buffer, using a grid-stride loop */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate (possibly repeated) COO inputs into their destination nonzeros, using a grid-stride loop */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+=grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}
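
/*
  Illustration (not part of the build): MatAddCOOValues() consumes the extended-COO
  mapping arrays built by MatSetPreallocationCOO_MPIAIJ(). For nnz distinct nonzeros,
  jmap[i]..jmap[i+1] delimits the run of input COO entries (gathered through perm[])
  that accumulate into the nonzero at offset imap[i] of the CSR value array. For
  example, two COO inputs hitting the same (row,col) give jmap = {0,2}, and thread 0
  sums kv[perm[0]] + kv[perm[1]] into a[imap[0]].
*/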
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote ranks */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owners and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
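
/*
  Usage sketch (illustrative only): the two COO callbacks above back the public
  MatSetPreallocationCOO()/MatSetValuesCOO() interface, e.g.

    ierr = MatCreate(comm,&A);CHKERRQ(ierr);
    ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
    ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatSetValuesCOO(A,coo_v,ADD_VALUES);CHKERRQ(ierr);

  where coo_v may live in either host or device memory; PetscGetMemType() above
  decides whether a host-to-device copy is needed.
*/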
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with the pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
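
/*
  Usage sketch (illustrative only): callers typically reach the routine above through
  the type-dispatched interface, e.g.

    ierr = MatSetType(B,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(B,5,NULL,2,NULL);CHKERRQ(ierr);

  which reserves 5 nonzeros per row in the diagonal block and 2 per row in the
  off-diagonal block; the 5 and 2 here are made-up figures for illustration.
*/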
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix-vector products should also be bound to the CPU */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  /* Overlap the halo exchange for the off-diagonal part with the diagonal-block product */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
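
/*
  Usage sketch (illustrative only): an existing MATMPIAIJ matrix can be moved to the
  GPU either programmatically,

    ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);

  or from the options database, by calling MatSetFromOptions(A) and running with
  -mat_type mpiaijcusparse.
*/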
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control
   dynamic memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
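
/*
  Usage sketch (illustrative only), assuming a square matrix with block-row distribution
  and at most 5 diagonal-block and 2 off-diagonal-block nonzeros per row (made-up figures):

    Mat A;
    ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,m,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/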
/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/
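
/*
  Example (illustrative only): selecting the type and storage formats from the command
  line for an application that calls MatSetFromOptions(); "./app" is a placeholder name:

    mpiexec -n 2 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
*/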
// get GPU pointers to a stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format != MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make 64-bit versions of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0; i<A->rmap->n+1; i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0; i<jacb->nz; i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i         = (PetscInt*)bi;
      h_mat->offdiag.j         = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %zu",sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0; i<A->rmap->n+1; i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0; i<jaca->nz; i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i            = (PetscInt*)ai;
    h_mat->diag.j            = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to the device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
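
/*
  Usage sketch (illustrative only, assuming the device-assembly API declared in
  petscaijdevice.h): the handle returned by MatCUSPARSEGetDeviceMatWrite() is intended
  to be passed to a user CUDA kernel that inserts values on the device, along the lines
  of the hypothetical kernel below.

    __global__ void AssembleKernel(PetscSplitCSRDataStructure d_mat)
    {
      PetscInt    row = ..., col = ...;   // indices computed by the kernel
      PetscScalar val = ...;
      MatSetValuesDevice(d_mat,1,&row,1,&col,&val,ADD_VALUES);
    }
*/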