#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
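
/*
   COO assembly, in outline: MatSetPreallocationCOO_MPIAIJCUSPARSE() below delegates the
   host-side analysis to MatSetPreallocationCOO_MPIAIJ(), which attaches a host-side
   MatCOOStruct_MPIAIJ to the matrix; a device mirror of that struct is then built here so
   that MatSetValuesCOO_MPIAIJCUSPARSE() can fill the matrix entirely with CUDA kernels,
   using a PetscSF to ship off-process entries to their owners.
*/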

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}
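
/*
   The kernels below consume the index arrays prepared at preallocation time. As they are used
   in MatAddLocalCOOValues() and MatAddRemoteCOOValues():

   - Ajmap1/Bjmap1 are CSR-like offset arrays over the nonzeros of the diagonal (A) and
     off-diagonal (B) blocks: local nonzero i receives the sum of v[Aperm1[k]] for
     k in [Ajmap1[i], Ajmap1[i+1]), so repeated COO entries hitting the same location are combined.
   - The *map2/*perm2 arrays play the same role for entries received from other ranks, with one
     extra indirection: Aimap2[i]/Bimap2[i] give the position of the target nonzero in the value array.
   - Cperm1 selects and orders the input values bound for other ranks into sendbuf.

   For illustration (values hypothetical): if COO entries 3 and 7 both address local nonzero 0
   of the diagonal block, then Ajmap1 = {0, 2, ...} and Aperm1 = {3, 7, ...}.
*/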

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}
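
/* The remote kernel below differs from the local one in two ways: its values come from the
   receive buffer rather than the user array, and it always accumulates (+=), because the
   INSERT_VALUES/ADD_VALUES distinction has already been applied by MatAddLocalCOOValues(). */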

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

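  /* The fill proceeds in overlapping phases: (1) pack the values bound for other ranks into
     sendbuf via Cperm1, (2) start the PetscSF reduction that ships them to their owners,
     (3) add the locally owned entries to A and B while that communication is in flight,
     (4) finish the reduction, and (5) add the received entries to A and B. */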
  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user passed v[] on the host, we need to copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
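
/*
   A minimal usage sketch of the COO interface implemented above; the sizes and values are
   illustrative only, and coo_i/coo_j/coo_v may be host or device arrays:

     PetscInt    coo_i[] = {0, 0, 1};  // global row indices, repeats allowed
     PetscInt    coo_j[] = {0, 1, 1};  // global column indices
     PetscScalar coo_v[] = {1.0, 2.0, 3.0};

     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // one-time analysis
     PetscCall(MatSetValuesCOO(A, coo_v, ADD_VALUES));      // fast, repeatable fill
*/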

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
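
/* The routine above returns A_loc = [Ad Ao] merged on the GPU; when requested, glob maps the
   merged local column numbering back to global columns: the owned (diagonal-block) columns are
   the contiguous range starting at the column ownership offset cst, followed by the ghost
   columns recorded in cmap. */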

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
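
/* MatMult() above, and MatMultAdd() below, overlap the halo exchange (the Mvctx scatter of xx
   into lvec) with the diagonal-block product, and only then apply the off-diagonal block to
   the ghost values; MatMultTranspose() reverses the pattern with an ADD_VALUES reverse scatter. */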

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
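
/* For illustration: the storage format may be chosen programmatically, e.g.

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_ELL));

   or through the options database handled by MatSetFromOptions_MPIAIJCUSPARSE() below,
   e.g. -mat_cusparse_storage_format ell */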

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE
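
/* Setup path taken when a matrix is set up without explicit preallocation: entries are then
   collected by the hash-based MatSetValues() defined by the include above, and this type only
   needs to propagate its requested cuSPARSE storage formats to the sequential blocks. */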
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
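
/* MATMPIAIJCUSPARSE is layered on MATMPIAIJ: MatCreate_MPIAIJCUSPARSE() below creates a plain
   MPIAIJ matrix and then converts it in place, with MatConvert_MPIAIJ_MPIAIJCUSPARSE() above
   swapping in the CUDA vector type, MATSEQAIJCUSPARSE blocks, and this file's method table. */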

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
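
/*
   A minimal creation sketch following the paradigm recommended in the manual page below;
   the global size and preallocation numbers are illustrative:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
     PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));
     PetscCall(MatSetFromOptions(A));
*/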

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
          calculated if `N` is given) For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
          submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

   Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/