1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #endif 19 #include <thrust/iterator/constant_iterator.h> 20 #include <thrust/remove.h> 21 #include <thrust/sort.h> 22 #include <thrust/unique.h> 23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST) 24 #include <cuda/std/functional> 25 #endif 26 27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29 /* 30 The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 31 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 32 */ 33 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 34 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 35 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 36 #endif 37 38 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 40 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 41 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 42 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 43 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 44 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 45 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 46 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 48 #endif 49 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 50 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 51 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 52 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 53 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 54 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 55 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 56 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 57 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 58 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 59 60 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 61 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 62 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - stores the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE data hanging off A->spptr.

  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are handled identically (both simply record `format`);
  every other operation is rejected with PETSC_ERR_SUP.

  NOTE(review): presumably this is the routine composed on the matrix as "MatCUSPARSESetFormat_C";
  the registration is not visible in this part of the file.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* reject anything but the two supported operations before touching the struct */
  PetscCheck(op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL, PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  gpu->format = format;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Selects the GPU storage format a `MATSEQAIJCUSPARSE` matrix uses for a
  given operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the method named "MatCUSPARSESetFormat_C" on A */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - records the use_cpu flag in the Mat_SeqAIJCUSPARSE
  data hanging off A->spptr.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  gpu->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  Use this routine to choose whether the solve is done on the CPU or on the GPU (GPU is the default).
.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the method named "MatCUSPARSESetUseCPUSolve_C" on A */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetOption_SeqAIJCUSPARSE - MatSetOption() handler for this matrix type.

  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is forwarded to
  MatSetOption_SeqAIJ().
*/
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is switched off, drop any transpose that was already built so that
       re-enabling the option later cannot pick up a stale one */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetFromOptions_SeqAIJCUSPARSE - reads the -mat_cusparse_* options for an unfactored matrix.

  Nothing is queried when A->factortype != MAT_FACTOR_NONE. The algorithm enums are parsed by
  position in their name tables, so after each one is set by the user a PetscCheck() verifies
  that the cuSPARSE enum still has the integer value the table assumes.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  Mat_SeqAIJCUSPARSE      *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, fmt));

    /* storage format used for every operation */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, fmt));

    /* CPU vs GPU triangular solve */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &set));
    /* the value was taken from the position in MatCUSPARSESpMVAlgorithms[]; make sure that still matches cuSPARSE */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!set || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!set || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &set));
    PetscCheck(!set || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &set));
    PetscCheck(!set || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_LU - puts the host LU factors of A on the GPU as one CSR
  matrix M that holds L and U together, for use with the cuSPARSE SpSV interface.

  Acts only when A->offloadmask == PETSC_OFFLOAD_CPU. The first call builds the index structure,
  descriptors, work vectors and SpSV buffers; every call re-packs the values, copies them to the
  device and runs (or, with CUDA >= 12.1.1, updates) the SpSV analysis.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
      // Only the index arrays are uploaded here; the values (csrVal) are filled further below on every call.

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U descriptors both wrap the same three device arrays; they differ only in fill mode and diagonal type.
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: the host row pointer and value buffer are kept on fs; the host column indices are no longer needed
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (done on every call, first-time setup or not)
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (the value at Adiag[i] is inverted to obtain it)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // The csrVal guard is needed because otherwise cusparse would error out:
      // "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      // Flag the non-transpose analysis as done and the transpose analysis as not done
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (first call) or refreshes (later calls) the
  GPU copy of the unit lower triangular factor stored in A, for the csrsv-based solve path.

  Returns immediately for an empty matrix; otherwise acts only when A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: build structure and values on the host, then create the device matrix */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        /* row 0 consists of the unit diagonal only */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        /* NOTE(review): v/vi start at the beginning of aa/aj and the loop starts at row 1, so this
           relies on row 0 of the host factor holding no entries (ai[1] == ai[0]) -- confirm */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          /* append the unit diagonal entry after the strictly-lower entries of the row */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: the assign() calls copy the host arrays into the Thrust containers */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* the host value buffer is kept on the factor struct (AA_h); the host index buffers are released */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj; /* set but not read in this branch */
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds (first call) or refreshes (later calls) the
  GPU copy of the non-unit upper triangular factor stored in A, for the csrsv-based solve path.

  Returns immediately for an empty matrix; otherwise acts only when A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first call: build structure and values on the host, then create the device matrix */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking the rows from last to first and filling the output from its end */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; v[nz] is the entry at adiag[i], which is inverted here */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then the off-diagonal entries of the row */
          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: the assign() calls copy the host arrays into the Thrust containers */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* the host value buffer is kept on the factor struct (AA_h); the host index buffers are released */
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* later calls: structure is unchanged, refresh the values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - brings the (I)LU factors held in A onto the GPU.

  Calls the builder(s) matching the CUDA version, records the nonzero count, marks the matrix as
  valid on both host and device, and, the first time only, uploads the row and column
  permutations when they are not the identity. Errors with PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  /* work vector of length n, created once */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices: the row permutation is uploaded only if it is not the identity and not already present */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: same treatment for the column permutation */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
631 Mnz = Ai[m]; // Unz (with the unit diagonal) 632 PetscCall(PetscMalloc1(Mnz, &Ma)); 633 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 634 PetscCall(PetscMalloc1(m, &D)); // the diagonal 635 for (PetscInt i = 0; i < m; i++) { 636 PetscInt ulen = Ai[i + 1] - Ai[i]; 637 Mj[Ai[i]] = i; // diagonal entry 638 PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 639 } 640 // Copy M (U) from host to device 641 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 642 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 643 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 644 PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m)); 645 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 646 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 647 648 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 649 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 650 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 651 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 652 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 653 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 654 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 655 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 656 657 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 658 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 659 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 660 661 // Allocate work vectors in SpSv 662 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 663 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 664 665 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 666 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 667 668 // Query buffer sizes for SpSV and then allocate buffers 669 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 670 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 671 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 672 673 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 674 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 675 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 676 677 // Record for reuse 678 fs->csrVal_h = Ma; 679 fs->diag_h = D; 680 PetscCall(PetscFree(Mj)); 681 } 682 // Copy the value 683 Ma = fs->csrVal_h; 684 D = fs->diag_h; 685 Mnz = Ai[m]; 686 for 
(PetscInt i = 0; i < m; i++) { 687 D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 688 Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 689 for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 690 } 691 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 692 PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 693 694 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 695 if (fs->updatedSpSVAnalysis) { 696 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 697 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 698 } else 699 #endif 700 { 701 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 702 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 703 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 704 fs->updatedSpSVAnalysis = PETSC_TRUE; 705 } 706 } 707 PetscFunctionReturn(PETSC_SUCCESS); 708 } 709 710 // Solve Ut D U x = b 711 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 712 { 713 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 714 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 715 const PetscScalar *barray; 716 PetscScalar *xarray; 717 thrust::device_ptr<const PetscScalar> bGPU; 718 
thrust::device_ptr<PetscScalar> xGPU; 719 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 720 PetscInt m = A->rmap->n; 721 722 PetscFunctionBegin; 723 PetscCall(PetscLogGpuTimeBegin()); 724 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 725 PetscCall(VecCUDAGetArrayRead(b, &barray)); 726 xGPU = thrust::device_pointer_cast(xarray); 727 bGPU = thrust::device_pointer_cast(barray); 728 729 // Reorder b with the row permutation if needed, and wrap the result in fs->X 730 if (fs->rpermIndices) { 731 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 732 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 733 } else { 734 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 735 } 736 737 // Solve Ut Y = X 738 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 739 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 740 741 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 742 // It is basically a vector element-wise multiplication, but cublas does not have it! 
743 #if CCCL_VERSION >= 3001000 744 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>())); 745 #else 746 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 747 #endif 748 749 // Solve U X = Y 750 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 751 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 752 } else { 753 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 754 } 755 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 756 757 // Reorder X with the column permutation if needed, and put the result back to x 758 if (fs->cpermIndices) { 759 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 760 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 761 } 762 763 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 764 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 765 PetscCall(PetscLogGpuTimeEnd()); 766 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 767 PetscFunctionReturn(PETSC_SUCCESS); 768 } 769 #else 770 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 771 { 772 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 773 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 774 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 775 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 776 PetscInt *AiUp, *AjUp; 777 PetscScalar *AAUp; 778 PetscScalar *AALo; 779 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 780 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 781 const PetscInt *ai = b->i, *aj = b->j, *vj; 782 const MatScalar *aa = b->a, *v; 783 784 PetscFunctionBegin; 785 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 786 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 787 try { 788 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 789 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 790 if (!upTriFactor && !loTriFactor) { 791 /* Allocate Space for the upper triangular matrix */ 792 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 793 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 794 795 /* Fill the upper triangular matrix */ 796 AiUp[0] = (PetscInt)0; 797 AiUp[n] = nzUpper; 798 offset = 0; 799 for (i = 0; i < n; i++) { 800 /* set the pointers */ 801 v = aa + ai[i]; 802 vj = aj + ai[i]; 803 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 804 805 /* first, set the diagonal elements */ 806 AjUp[offset] = (PetscInt)i; 807 AAUp[offset] = (MatScalar)1.0 / v[nz]; 808 AiUp[i] = offset; 809 AALo[offset] = (MatScalar)1.0 / v[nz]; 810 811 offset += 1; 812 if (nz > 0) { 813 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 814 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 815 for (j = offset; j < offset + nz; j++) { 816 AAUp[j] = -AAUp[j]; 817 AALo[j] = AAUp[j] / v[nz]; 818 } 819 offset += nz; 820 } 821 } 822 823 /* allocate space for the triangular factor information */ 824 PetscCall(PetscNew(&upTriFactor)); 825 upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 826 827 /* Create the matrix description */ 828 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 829 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 830 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 831 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 832 #else 833 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 834 #endif 835 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 836 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 837 838 /* set the matrix */ 839 upTriFactor->csrMat = new CsrMatrix; 840 upTriFactor->csrMat->num_rows = A->rmap->n; 841 upTriFactor->csrMat->num_cols = A->cmap->n; 842 upTriFactor->csrMat->num_entries = a->nz; 843 844 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 845 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 846 847 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 849 850 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 852 853 /* set the operation */ 854 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 855 856 /* Create the solve analysis information */ 857 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 858 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 859 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 860 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 861 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 
&upTriFactor->solveBufferSize)); 862 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 863 #endif 864 865 /* perform the solve analysis */ 866 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 867 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 868 869 PetscCallCUDA(WaitForCUDA()); 870 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 871 872 /* assign the pointer */ 873 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 874 875 /* allocate space for the triangular factor information */ 876 PetscCall(PetscNew(&loTriFactor)); 877 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 878 879 /* Create the matrix description */ 880 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 881 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 882 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 883 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 884 #else 885 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 886 #endif 887 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 888 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 889 890 /* set the operation */ 891 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 892 893 /* set the matrix */ 894 loTriFactor->csrMat = new CsrMatrix; 895 loTriFactor->csrMat->num_rows = A->rmap->n; 896 loTriFactor->csrMat->num_cols = A->cmap->n; 897 loTriFactor->csrMat->num_entries = a->nz; 898 899 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 
900 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 901 902 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 903 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 904 905 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 906 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 907 908 /* Create the solve analysis information */ 909 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 910 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 911 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 912 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 913 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 914 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 915 #endif 916 917 /* perform the solve analysis */ 918 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 919 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 920 921 PetscCallCUDA(WaitForCUDA()); 922 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 923 924 /* assign the pointer */ 925 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 926 927 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 928 PetscCallCUDA(cudaFreeHost(AiUp)); 929 PetscCallCUDA(cudaFreeHost(AjUp)); 930 } else { 931 /* Fill the upper triangular matrix */ 932 offset = 
0; 933 for (i = 0; i < n; i++) { 934 /* set the pointers */ 935 v = aa + ai[i]; 936 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 937 938 /* first, set the diagonal elements */ 939 AAUp[offset] = 1.0 / v[nz]; 940 AALo[offset] = 1.0 / v[nz]; 941 942 offset += 1; 943 if (nz > 0) { 944 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 945 for (j = offset; j < offset + nz; j++) { 946 AAUp[j] = -AAUp[j]; 947 AALo[j] = AAUp[j] / v[nz]; 948 } 949 offset += nz; 950 } 951 } 952 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 953 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 954 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 955 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 956 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 957 } 958 PetscCallCUDA(cudaFreeHost(AAUp)); 959 PetscCallCUDA(cudaFreeHost(AALo)); 960 } catch (char *ex) { 961 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 962 } 963 } 964 PetscFunctionReturn(PETSC_SUCCESS); 965 } 966 #endif 967 968 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 969 { 970 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 971 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 972 IS ip = a->row; 973 PetscBool perm_identity; 974 PetscInt n = A->rmap->n; 975 976 PetscFunctionBegin; 977 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 978 979 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 980 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A)); 981 #else 982 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 983 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 984 #endif 985 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 986 987 A->offloadmask = PETSC_OFFLOAD_BOTH; 988 989 /* lower triangular indices */ 990 PetscCall(ISIdentity(ip, &perm_identity)); 991 if 
(!perm_identity) { 992 IS iip; 993 const PetscInt *irip, *rip; 994 995 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 996 PetscCall(ISGetIndices(iip, &irip)); 997 PetscCall(ISGetIndices(ip, &rip)); 998 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 999 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 1000 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1001 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1002 PetscCall(ISRestoreIndices(iip, &irip)); 1003 PetscCall(ISDestroy(&iip)); 1004 PetscCall(ISRestoreIndices(ip, &rip)); 1005 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1006 } 1007 PetscFunctionReturn(PETSC_SUCCESS); 1008 } 1009 1010 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1011 { 1012 PetscFunctionBegin; 1013 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1014 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1015 B->offloadmask = PETSC_OFFLOAD_CPU; 1016 1017 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1018 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1019 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1020 #else 1021 /* determine which version of MatSolve needs to be used. 
*/ 1022 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1023 IS ip = b->row; 1024 PetscBool perm_identity; 1025 1026 PetscCall(ISIdentity(ip, &perm_identity)); 1027 if (perm_identity) { 1028 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1029 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1030 } else { 1031 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1032 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1033 } 1034 #endif 1035 B->ops->matsolve = NULL; 1036 B->ops->matsolvetranspose = NULL; 1037 1038 /* get the triangular factors */ 1039 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1040 PetscFunctionReturn(PETSC_SUCCESS); 1041 } 1042 1043 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1044 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1045 { 1046 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1047 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1048 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1049 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1050 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1051 cusparseIndexBase_t indexBase; 1052 cusparseMatrixType_t matrixType; 1053 cusparseFillMode_t fillMode; 1054 cusparseDiagType_t diagType; 1055 1056 PetscFunctionBegin; 1057 /* allocate space for the transpose of the lower triangular factor */ 1058 PetscCall(PetscNew(&loTriFactorT)); 1059 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1060 1061 /* set the matrix descriptors of the lower triangular factor */ 1062 matrixType = cusparseGetMatType(loTriFactor->descr); 1063 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1064 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1065 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1066 1067 /* Create the matrix description */ 1068 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1069 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1070 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1071 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1072 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1073 1074 /* set the operation */ 1075 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1076 1077 /* allocate GPU space for the CSC of the lower triangular factor*/ 1078 loTriFactorT->csrMat = new CsrMatrix; 1079 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1080 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1081 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1082 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1083 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1084 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1085 1086 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1087 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1088 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1089 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1090 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1091 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1092 #endif 1093 1094 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1095 { 1096 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1097 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1098 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1099 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1100 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1101 #else 1102 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1103 #endif 1104 PetscCallCUSPARSE(stat); 1105 } 1106 1107 PetscCallCUDA(WaitForCUDA()); 1108 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1109 1110 /* Create the solve analysis information */ 1111 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1112 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1113 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1114 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1115 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1116 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1117 #endif 1118 1119 /* perform the solve analysis */ 1120 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1121 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1122 1123 PetscCallCUDA(WaitForCUDA()); 1124 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1125 1126 /* assign the pointer */ 1127 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1128 1129 /*********************************************/ 1130 /* Now the Transpose of the Upper Tri Factor */ 1131 /*********************************************/ 1132 1133 /* allocate space for the transpose of the upper triangular factor */ 1134 PetscCall(PetscNew(&upTriFactorT)); 1135 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1136 1137 /* set the matrix descriptors of the upper triangular factor */ 1138 matrixType = cusparseGetMatType(upTriFactor->descr); 1139 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1140 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1141 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1142 1143 /* Create the matrix description */ 1144 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1145 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1146 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1147 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1148 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1149 1150 /* set the operation */ 1151 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1152 1153 /* allocate GPU space for the CSC of the upper triangular factor*/ 1154 upTriFactorT->csrMat = new CsrMatrix; 1155 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1156 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1157 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1158 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1159 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1160 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1161 1162 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1163 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1164 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1165 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1166 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1167 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1168 #endif 1169 1170 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1171 { 1172 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1173 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1174 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1175 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1176 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1177 #else 1178 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1179 #endif 1180 PetscCallCUSPARSE(stat); 1181 } 1182 1183 PetscCallCUDA(WaitForCUDA()); 1184 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1185 1186 /* Create the solve analysis information */ 1187 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1188 
/*
  Unary functor handed to thrust::transform() below: converts a PetscScalar to a PetscInt by casting its real part.
  MatSeqAIJCUSPARSEFormExplicitTranspose() uses it to turn the scalar-encoded positions written by csr2csc into
  the integer gather map stored in csr2csc_i.
*/
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the values of, the explicit transpose of A that is
  cached in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; A->spptr is used as a Mat_SeqAIJCUSPARSE and A->data as a Mat_SeqAIJ

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is called first. If A->transupdated is already set, the routine returns without doing
  any work (after checking that the cached transpose struct exists).

  For the CSR format the work is split in two stages:
  1) if no transpose struct is cached, the struct, its cuSPARSE descriptor, the device scalars alpha_one/beta_zero/beta_one
     and the CSR arrays of the transpose are allocated;
  2) on the first pass (csr2csc_i absent) csr2csc is run on the sequence 0,1,2,... instead of the real values, so the
     "values" of the transpose are the source positions of each entry; these are stored as integers in csr2csc_i.
     On every pass the actual values are then gathered through csr2csc_i with a permutation iterator.

  For ELL/HYB (CUDA < 11 only) the transpose is rebuilt from scratch each time through HYB -> CSR -> CSC -> HYB.

  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* an up-to-date flag without a cached transpose is an inconsistent state */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats have no in-place value update below, so the old transpose is invalidated first
     (presumably this clears cusparsestruct->matTranspose so the creation branch runs -- TODO confirm) */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    /* the transpose inherits the index base of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of the transpose: rows and columns are swapped, the nonzero count is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* device copy of the host row offsets a->i; it is the row-offset input of csr2csc further below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
    #endif
  #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
  #else
      /* temp holds A converted to CSR, tempT holds its CSC form (= CSR of the transpose); both are freed below */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
  #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* first pass: transpose the sequence 0,1,2,... in place of the values, so that after csr2csc the "values"
         of the transpose record, for each entry, its position in the value array of the original matrix */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* scratch space for cusparseCsr2cscEx2(); released with cudaFree() at the end of this branch */
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
  #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
  #else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
  #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the scalar-encoded source positions into the integer gather map csr2csc_i */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
  #endif
    }
    /* every pass: gather the current values of A into the transpose, matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves A x = b with the factors held in A->spptr, using cusparseSpSV_solve().

  Input Parameters:
+ A - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The sequence is: optional row permutation of b into the work array fs->X, solve with L into fs->Y, solve with U,
  optional column permutation of the result into x. When there is no column permutation the U solve writes
  directly into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                m   = A->rmap->n;
  const cusparseOperation_t     op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       alg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));

  const thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xarray);
  const thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

  // Right-hand side of the L solve: b itself, or b gathered through the row permutation into fs->X
  if (!fs->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  } else {
    auto permutedFirst = thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin());
    auto permutedLast  = thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end());

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), permutedFirst, permutedLast, thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y; the result goes to the work array when it still has to be permuted, otherwise straight to x
  void *const uSolveOutput = fs->cpermIndices ? (void *)fs->X : (void *)xarray;

  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, uSolveOutput));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    auto permutedFirst = thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin());
    auto permutedLast  = thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end());

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), permutedFirst, permutedLast, xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Solves A^T x = b with the factors held in A->spptr, using cusparseSpSV_solve()
  with CUSPARSE_OPERATION_TRANSPOSE on the same L and U matrix descriptors.

  Input Parameters:
+ A - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The transpose SpSV descriptors and their buffers are created on the first call (createdTransposeSpSVDescr), and
  the transpose analysis is (re)run whenever updatedTransposeSpSVAnalysis is not set.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscInt                m   = A->rmap->n;
  const cusparseOperation_t     opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       alg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  // One-time setup on the first MatSolveTranspose(): descriptors and work buffers for the transposed solves
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    // The matrix is still L. We only do transpose solve with it
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // Analysis for the transposed solves, redone whenever the flag has been cleared
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));

  const thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xarray);
  const thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

  // Right-hand side of the first solve: b itself, or b gathered through the row permutation into fs->X
  if (!fs->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  } else {
    auto permutedFirst = thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin());
    auto permutedLast  = thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end());

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), permutedFirst, permutedLast, thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y; if need to permute, we need to use the intermediate buffer X, otherwise write straight to x
  void *const lSolveOutput = fs->cpermIndices ? (void *)fs->X : (void *)xarray;

  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, lSolveOutput));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    auto permutedFirst = thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin());
    auto permutedLast  = thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end());

    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), permutedFirst, permutedLast, xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Solves A^T x = b with explicitly transposed triangular factors (legacy csrsv path).

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to build them on
  the fly. The row and column permutation index arrays are dereferenced unconditionally.

  Open question kept from the original source: why do we need to analyze the transposed matrix again? Can't we just
  use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?

  The thrust::copy() calls are wrapped in PetscCallThrust(), as in the other routines of this file, so that a Thrust
  exception is turned into a PETSc error instead of propagating out of this function.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xGPU[k] = bGPU[rperm[k]] */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U: reads xarray, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: reads the work vector, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Solves A^T x = b with explicitly transposed triangular factors
  and no row/column permutation (legacy csrsv path).

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to build them on
  the fly. The U solve reads bb and writes the work vector; the L solve reads the work vector and writes xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Build the transposed factors lazily, then pick up the freshly stored pointers */
  if (!lowerT && !upperT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    /* Step 1: solve with U, rhs -> work */
    const auto Umat = upperT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, Umat->num_rows, Umat->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr, Umat->values->data().get(), Umat->row_offsets->data().get(), Umat->column_indices->data().get(), upperT->solveInfo, rhs, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));
  }
  {
    /* Step 2: solve with L, work -> sol */
    const auto Lmat = lowerT->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, Lmat->num_rows, Lmat->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr, Lmat->values->data().get(), Lmat->row_offsets->data().get(), Lmat->column_indices->data().get(), lowerT->solveInfo, work->data().get(), sol, lowerT->solvePolicy, lowerT->solveBuffer));
  }

  /* Hand the arrays back and log */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolve_SeqAIJCUSPARSE - Solves A x = b with the lower and upper triangular factors (legacy csrsv path),
  applying the row permutation before and the column permutation after the two triangular solves.

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The row and column permutation index arrays are dereferenced unconditionally.

  The thrust::copy() calls are wrapped in PetscCallThrust(), as in the other routines of this file, so that a Thrust
  exception is turned into a PETSc error instead of propagating out of this function.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[k] = bGPU[rperm[k]] */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()));

  /* Next, solve L: reads the work vector, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: reads xarray, writes the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: xGPU[k] = tempGPU[cperm[k]] */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solves A x = b with the lower and upper triangular factors and no
  row/column permutation (legacy csrsv path).

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The L solve reads bb and writes the work vector; the U solve reads the work vector and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  {
    /* Step 1: solve with L, rhs -> work */
    const auto Lmat = lower->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, Lmat->num_rows, Lmat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr, Lmat->values->data().get(), Lmat->row_offsets->data().get(), Lmat->column_indices->data().get(), lower->solveInfo, rhs, work->data().get(), lower->solvePolicy, lower->solveBuffer));
  }
  {
    /* Step 2: solve with U, work -> sol */
    const auto Umat = upper->csrMat;

    PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, Umat->num_rows, Umat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr, Umat->values->data().get(), Umat->row_offsets->data().get(), Umat->column_indices->data().get(), upper->solveInfo, work->data().get(), sol, upper->solvePolicy, upper->solveBuffer));
  }

  /* Hand the arrays back and log */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric ILU(0) factorization on the GPU with cusparseXcsrilu02().

  Input Parameters:
+ fact - the factor matrix; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor; must be of type MATSEQAIJCUSPARSE (checked in debug builds)
- info - unused

  Notes:
  The values of A are copied on the device into fs->csrVal and factored in place. Afterwards the SpSV descriptors
  for the L and U solves are either re-analyzed, or (CUDA >= 12.1.1, analysis already done once) updated with
  cusparseSpSV_updateMatrix().

  In both cases the factor values have changed, so updatedTransposeSpSVAnalysis is cleared; the transpose solve then
  redoes its own analysis on spsvDescr_Lt/spsvDescr_Ut the next time it runs. Previously the flag was only cleared
  on the re-analysis path, so on the update path a transpose solve could keep using analysis data computed from
  the previous factor values.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* analysis was done by an earlier factorization: only push the new values into the L and U solve descriptors */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed on either path above, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1748 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1749 fact->ops->matsolve = NULL; 1750 fact->ops->matsolvetranspose = NULL; 1751 PetscCall(PetscLogGpuTimeEnd()); 1752 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1753 PetscFunctionReturn(PETSC_SUCCESS); 1754 } 1755 1756 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1757 { 1758 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1759 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1760 PetscInt m, nz; 1761 1762 PetscFunctionBegin; 1763 if (PetscDefined(USE_DEBUG)) { 1764 PetscInt i; 1765 PetscBool flg, missing; 1766 1767 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1768 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1769 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1770 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1771 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1772 } 1773 1774 /* Free the old stale stuff */ 1775 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1776 1777 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1778 but they will not be used. Allocate them just for easy debugging. 
1779 */ 1780 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1781 1782 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1783 fact->factortype = MAT_FACTOR_ILU; 1784 fact->info.factor_mallocs = 0; 1785 fact->info.fill_ratio_given = info->fill; 1786 fact->info.fill_ratio_needed = 1.0; 1787 1788 aij->row = NULL; 1789 aij->col = NULL; 1790 1791 /* ====================================================================== */ 1792 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1793 /* We'll do in-place factorization on fact */ 1794 /* ====================================================================== */ 1795 const int *Ai, *Aj; 1796 1797 m = fact->rmap->n; 1798 nz = aij->nz; 1799 1800 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1801 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1802 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1803 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1804 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1805 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1806 1807 /* ====================================================================== */ 1808 /* Create descriptors for M, L, U */ 1809 /* ====================================================================== */ 1810 cusparseFillMode_t fillMode; 1811 cusparseDiagType_t diagType; 1812 1813 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1814 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1815 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1816 1817 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1818 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1819 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1820 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1821 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1822 */ 1823 fillMode = CUSPARSE_FILL_MODE_LOWER; 1824 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1825 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1826 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1828 1829 fillMode = CUSPARSE_FILL_MODE_UPPER; 1830 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1831 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1832 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1833 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1834 1835 /* ========================================================================= */ 1836 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1837 /* ========================================================================= */ 1838 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1839 if (m) 1840 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1841 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1842 1843 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1844 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1845 1846 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1847 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1848 1849 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1850 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1851 1852 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1853 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1854 1855 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1856 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1857 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1858 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1859 */ 1860 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1861 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1862 fs->spsvBuffer_L = fs->factBuffer_M; 1863 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1864 } else { 1865 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1866 fs->spsvBuffer_U = fs->factBuffer_M; 1867 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1868 } 1869 1870 /* ========================================================================== */ 1871 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1872 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1873 /* ========================================================================== */ 1874 int structural_zero; 1875 cusparseStatus_t status; 1876 1877 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1878 if (m) 1879 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1880 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1881 if (PetscDefined(USE_DEBUG)) { 1882 /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1883 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1884 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1885 } 1886 1887 /* Estimate FLOPs of the numeric factorization */ 1888 { 1889 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1890 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1891 PetscLogDouble flops = 0.0; 1892 1893 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1894 Ai = Aseq->i; 1895 Adiag = Aseq->diag; 1896 for (PetscInt i = 0; i < m; i++) { 1897 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1898 nzRow = Ai[i + 1] - Ai[i]; 1899 nzLeft = Adiag[i] - Ai[i]; 1900 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1901 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1902 */ 1903 nzLeft = (nzRow - 1) / 2; 1904 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1905 } 1906 } 1907 fs->numericFactFlops = flops; 1908 } 1909 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1910 PetscFunctionReturn(PETSC_SUCCESS); 1911 } 1912 1913 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1914 { 1915 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1916 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1917 const PetscScalar *barray; 1918 PetscScalar *xarray; 1919 1920 PetscFunctionBegin; 1921 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1922 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1923 PetscCall(PetscLogGpuTimeBegin()); 1924 1925 /* Solve L*y = b */ 1926 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1927 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1928 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1929 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1930 1931 /* Solve Lt*x = y */ 1932 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1933 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1934 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1935 1936 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1937 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1938 1939 PetscCall(PetscLogGpuTimeEnd()); 1940 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1941 PetscFunctionReturn(PETSC_SUCCESS); 1942 } 1943 1944 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1945 { 1946 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1947 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1948 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1949 CsrMatrix *Acsr; 1950 PetscInt m, nz; 1951 PetscBool flg; 1952 1953 PetscFunctionBegin; 1954 if (PetscDefined(USE_DEBUG)) { 1955 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1956 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1957 } 1958 1959 /* Copy A's value to fact */ 1960 m = fact->rmap->n; 1961 nz = aij->nz; 1962 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1963 Acsr = (CsrMatrix *)Acusp->mat->mat; 1964 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1965 1966 /* Factorize fact inplace */ 1967 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1968 csric02() only takes the lower triangular part of matrix A to perform factorization. 
1969 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1970 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1971 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1972 */ 1973 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1974 if (PetscDefined(USE_DEBUG)) { 1975 int numerical_zero; 1976 cusparseStatus_t status; 1977 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1978 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1979 } 1980 1981 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1982 if (fs->updatedSpSVAnalysis) { 1983 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1984 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1985 } else 1986 #endif 1987 { 1988 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1989 1990 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1991 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1992 */ 1993 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1994 fs->updatedSpSVAnalysis = PETSC_TRUE; 1995 } 1996 1997 fact->offloadmask = PETSC_OFFLOAD_GPU; 1998 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1999 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2000 fact->ops->matsolve = NULL; 2001 fact->ops->matsolvetranspose = NULL; 2002 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2003 PetscFunctionReturn(PETSC_SUCCESS); 2004 } 2005 2006 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2007 { 2008 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2009 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2010 PetscInt m, nz; 2011 2012 PetscFunctionBegin; 2013 if (PetscDefined(USE_DEBUG)) { 2014 PetscInt i; 2015 PetscBool flg, missing; 2016 2017 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2018 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2019 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2020 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2021 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2022 } 2023 2024 /* Free the old stale stuff */ 2025 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2026 2027 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2028 but they will not be used. Allocate them just for easy debugging. 
2029 */ 2030 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2031 2032 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2033 fact->factortype = MAT_FACTOR_ICC; 2034 fact->info.factor_mallocs = 0; 2035 fact->info.fill_ratio_given = info->fill; 2036 fact->info.fill_ratio_needed = 1.0; 2037 2038 aij->row = NULL; 2039 aij->col = NULL; 2040 2041 /* ====================================================================== */ 2042 /* Copy A's i, j to fact and also allocate the value array of fact. */ 2043 /* We'll do in-place factorization on fact */ 2044 /* ====================================================================== */ 2045 const int *Ai, *Aj; 2046 2047 m = fact->rmap->n; 2048 nz = aij->nz; 2049 2050 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2051 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2052 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2053 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2054 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2055 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2056 2057 /* ====================================================================== */ 2058 /* Create mat descriptors for M, L */ 2059 /* ====================================================================== */ 2060 cusparseFillMode_t fillMode; 2061 cusparseDiagType_t diagType; 2062 2063 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2064 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2065 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2066 2067 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2068 
cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2069 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2070 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2071 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2072 */ 2073 fillMode = CUSPARSE_FILL_MODE_LOWER; 2074 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2075 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2076 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2077 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2078 2079 /* ========================================================================= */ 2080 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2081 /* ========================================================================= */ 2082 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2083 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2084 2085 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2086 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2087 2088 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2089 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2090 2091 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2092 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2093 2094 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2095 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2096 2097 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2098 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2099 */ 2100 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2101 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2102 fs->spsvBuffer_L = fs->factBuffer_M; 2103 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2104 } else { 2105 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2106 fs->spsvBuffer_Lt = fs->factBuffer_M; 2107 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2108 } 2109 2110 /* ========================================================================== */ 2111 /* Perform analysis of ic0 on M */ 2112 /* The lower triangular part of M has the same sparsity pattern as L */ 2113 /* ========================================================================== */ 2114 int structural_zero; 2115 cusparseStatus_t status; 2116 2117 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2118 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2119 if (PetscDefined(USE_DEBUG)) { 2120 /* cusparseXcsric02_zeroPivot() is a blocking call. 
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2121 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2122 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2123 } 2124 2125 /* Estimate FLOPs of the numeric factorization */ 2126 { 2127 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2128 PetscInt *Ai, nzRow, nzLeft; 2129 PetscLogDouble flops = 0.0; 2130 2131 Ai = Aseq->i; 2132 for (PetscInt i = 0; i < m; i++) { 2133 nzRow = Ai[i + 1] - Ai[i]; 2134 if (nzRow > 1) { 2135 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2136 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2137 */ 2138 nzLeft = (nzRow - 1) / 2; 2139 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2140 } 2141 } 2142 fs->numericFactFlops = flops; 2143 } 2144 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 #endif 2148 2149 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2150 { 2151 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2152 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2153 2154 PetscFunctionBegin; 2155 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2156 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2157 B->offloadmask = PETSC_OFFLOAD_CPU; 2158 2159 if (!cusparsestruct->use_cpu_solve) { 2160 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2161 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2162 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2163 #else 2164 /* determine which version of MatSolve needs to be used. 
*/ 2165 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2166 IS isrow = b->row, iscol = b->col; 2167 PetscBool row_identity, col_identity; 2168 2169 PetscCall(ISIdentity(isrow, &row_identity)); 2170 PetscCall(ISIdentity(iscol, &col_identity)); 2171 if (row_identity && col_identity) { 2172 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2173 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2174 } else { 2175 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2176 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2177 } 2178 #endif 2179 } 2180 B->ops->matsolve = NULL; 2181 B->ops->matsolvetranspose = NULL; 2182 2183 /* get the triangular factors */ 2184 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2185 PetscFunctionReturn(PETSC_SUCCESS); 2186 } 2187 2188 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2189 { 2190 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2191 2192 PetscFunctionBegin; 2193 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2194 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2195 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2196 PetscFunctionReturn(PETSC_SUCCESS); 2197 } 2198 2199 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2200 { 2201 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2202 2203 PetscFunctionBegin; 2204 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2205 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2206 if (!info->factoronhost) { 2207 PetscCall(ISIdentity(isrow, &row_identity)); 2208 PetscCall(ISIdentity(iscol, &col_identity)); 2209 } 2210 if (!info->levels && row_identity && col_identity) { 2211 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, 
isrow, iscol, info)); 2212 } else 2213 #endif 2214 { 2215 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2216 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2217 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2218 } 2219 PetscFunctionReturn(PETSC_SUCCESS); 2220 } 2221 2222 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2223 { 2224 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2225 2226 PetscFunctionBegin; 2227 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2228 PetscBool perm_identity = PETSC_FALSE; 2229 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2230 if (!info->levels && perm_identity) { 2231 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2232 } else 2233 #endif 2234 { 2235 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2236 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2237 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2238 } 2239 PetscFunctionReturn(PETSC_SUCCESS); 2240 } 2241 2242 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2243 { 2244 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2245 2246 PetscFunctionBegin; 2247 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2248 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2249 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2250 PetscFunctionReturn(PETSC_SUCCESS); 2251 } 2252 2253 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2254 { 2255 PetscFunctionBegin; 2256 *type = MATSOLVERCUSPARSE; 2257 PetscFunctionReturn(PETSC_SUCCESS); 2258 } 2259 2260 /*MC 2261 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 
on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  Creates the factor matrix B (n x n, n = local rows of A) of type MATSEQAIJCUSPARSE for the requested
  factorization type and installs the symbolic factorization routines: the CUSPARSE ones when A is not
  bound to the CPU, the plain SeqAIJ ones otherwise. Also records the preferred orderings per factor type.

  Errors with PETSC_ERR_SUP for factor types other than LU/ILU/ILUDT/CHOLESKY/ICC.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Copies the matrix values from the device to the host array a->a when the offload mask says the
  up-to-date copy lives on the GPU only; afterwards the mask is PETSC_OFFLOAD_BOTH.

  Unfactored matrices copy from the CsrMatrix value array. With CUDA >= 11.4 a factored matrix is copied
  from fs->csrVal when that buffer exists; any other factored case raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* same spptr viewed as the tri-factor struct; only used when A->factortype != MAT_FACTOR_NONE */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-write host access: sync values from the GPU if needed, then hand out the host value array */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-write host access: sets the offload mask to PETSC_OFFLOAD_CPU and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only host access: sync values from the GPU if needed, then hand out the host value array */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-only host access: the offload mask is left untouched, only the caller's pointer is cleared */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only host access: no device-to-host copy is performed before handing out the host value array */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of write-only host access: sets the offload mask to PETSC_OFFLOAD_CPU and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Returns pointers to the CSR arrays held in the device-side CsrMatrix (thrust arrays) of an unfactored matrix.

  Input Parameter:
. A - the matrix; must not be factored and must use the MAT_CUSPARSE_CSR format

  Output Parameters (each may be NULL if not wanted):
+ i     - row offsets (errors with PETSC_ERR_SUP when PETSc is built with 64-bit indices)
. j     - column indices (errors with PETSC_ERR_SUP when PETSc is built with 64-bit indices)
. a     - values
- mtype - always PETSC_MEMTYPE_CUDA

  The state checks are done before MatSeqAIJCUSPARSECopyToGPU() because that routine interprets
  A->spptr as a Mat_SeqAIJCUSPARSE and dereferences it.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  PetscCheck(A->spptr != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); /* read spptr contents only after the copy */
  /* cusp->mat->mat is only a CsrMatrix for the CSR storage format */
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(cusp->mat && cusp->mat->mat, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Makes the device copy of A current. Does nothing unless the offload mask is PETSC_OFFLOAD_UNALLOCATED
  or PETSC_OFFLOAD_CPU.

  - If the nonzero state is unchanged and the format is CSR, only the values are re-uploaded and the
    cached transpose values are invalidated.
  - Otherwise the device mult struct, work vector, cached row offsets and transpose are destroyed and
    rebuilt from the host CSR data (using the compressed-row arrays when a->compressedrow.use is set).

  The offload mask becomes PETSC_OFFLOAD_BOTH unless the host has no value array (a->a == NULL), in which
  case only the structure was uploaded and the mask is left as it was.

  Errors with PETSC_ERR_GPU when A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values: upload the structure only and do not claim the device copy is complete */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR matrix on the device, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Functor on a 2-tuple: element 1 += element 0 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor on a 2-tuple: element 1 = element 0 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor on a 2-tuple: element 0 = element 1 */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Product data attached to C->product->data by the symbolic routines below; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* C was MATSEQDENSE (CPU) on entry to the symbolic phase; convert back after the numeric phase */
  PetscScalar   *Bt;       /* device buffer for B^T, allocated only for CUDA < 11 and ABt/RARt products */
  Mat            X;        /* intermediate dense matrix for PtAP and RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: frees device buffers, cuSPARSE descriptors, the intermediate matrix X and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/*
  Numeric phase of a sparse (A, MATSEQAIJCUSPARSE) times dense (B) product.

  The sparse-dense multiply is done with cusparseSpMM() (CUDA >= 11) or cusparse_csr_spmm() (older CUDA,
  which needs an explicit B^T for ABt/RARt). For PtAP and RARt the result of A*B (resp. A*B^T) is stored in
  mmdata->X and then combined with B through MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal().

  A host (non-CUDA) B is converted in place to MATSEQDENSECUDA for the computation and converted back at the
  end; C is converted back to MATSEQDENSE when mmdata->cisdense is set.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select the device matrix, the operation applied to it, and the size (m x n) of the SpMM result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP and RARt write the sparse-dense product into the intermediate matrix X, the others directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* allocate the work buffer the first time, grow it when the required size increased */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      mmdata->mmBuffer = NULL; /* do not keep a dangling pointer around if the allocation below fails */
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Symbolic phase of a sparse (A, MATSEQAIJCUSPARSE in CSR format) times dense (B) product.

  Sets the sizes and block sizes of C according to the product type, makes C a MATSEQDENSECUDA matrix
  (remembering in mmdata->cisdense whether it was MATSEQDENSE), allocates the product data and, for
  PtAP/RARt, creates the intermediate dense matrix mmdata->X. Installs the matching numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Numeric phase of a sparse-sparse product where A, B and C are all MATSEQAIJCUSPARSE in CSR format.

  If the symbolic phase already computed the values (mmdata->reusesym), or C has no nonzeros, only the
  assembly bookkeeping at the end is performed. Otherwise the values of C are recomputed on the GPU with
  the cuSPARSE SpGEMM routines and C's offload mask is set to PETSC_OFFLOAD_GPU.

  AtB and ABt are handled through the explicit transposes cached in the cusparse structs, or treated as AB
  when the transposed operand is known to be symmetric (the symbolic phase must have made the same choice).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transposed product with a symmetric operand is computed as a plain AB product */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3046 3047 PetscFunctionBegin; 3048 MatCheckProduct(C, 1); 3049 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3050 A = product->A; 3051 B = product->B; 3052 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3053 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3054 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3055 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3056 a = (Mat_SeqAIJ *)A->data; 3057 b = (Mat_SeqAIJ *)B->data; 3058 /* product data */ 3059 PetscCall(PetscNew(&mmdata)); 3060 C->product->data = mmdata; 3061 C->product->destroy = MatDestroy_MatMatCusparse; 3062 3063 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3064 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3065 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3066 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3067 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3068 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3069 3070 ptype = product->type; 3071 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3072 ptype = MATPRODUCT_AB; 3073 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3074 } 3075 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3076 ptype = MATPRODUCT_AB; 3077 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3078 } 3079 biscompressed = PETSC_FALSE; 3080 ciscompressed = PETSC_FALSE; 3081 switch (ptype) { 3082 case MATPRODUCT_AB: 3083 m 
= A->rmap->n; 3084 n = B->cmap->n; 3085 k = A->cmap->n; 3086 Amat = Acusp->mat; 3087 Bmat = Bcusp->mat; 3088 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3089 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3090 break; 3091 case MATPRODUCT_AtB: 3092 m = A->cmap->n; 3093 n = B->cmap->n; 3094 k = A->rmap->n; 3095 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3096 Amat = Acusp->matTranspose; 3097 Bmat = Bcusp->mat; 3098 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3099 break; 3100 case MATPRODUCT_ABt: 3101 m = A->rmap->n; 3102 n = B->rmap->n; 3103 k = A->cmap->n; 3104 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3105 Amat = Acusp->mat; 3106 Bmat = Bcusp->matTranspose; 3107 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3108 break; 3109 default: 3110 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3111 } 3112 3113 /* create cusparse matrix */ 3114 PetscCall(MatSetSizes(C, m, n, m, n)); 3115 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3116 c = (Mat_SeqAIJ *)C->data; 3117 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3118 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3119 Ccsr = new CsrMatrix; 3120 3121 c->compressedrow.use = ciscompressed; 3122 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3123 c->compressedrow.nrows = a->compressedrow.nrows; 3124 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3125 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3126 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3127 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3128 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3129 } else { 3130 c->compressedrow.nrows = 0; 3131 c->compressedrow.i = NULL; 3132 
c->compressedrow.rindex = NULL; 3133 Ccusp->workVector = NULL; 3134 Cmat->cprowIndices = NULL; 3135 } 3136 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3137 Ccusp->mat = Cmat; 3138 Ccusp->mat->mat = Ccsr; 3139 Ccsr->num_rows = Ccusp->nrows; 3140 Ccsr->num_cols = n; 3141 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3142 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3143 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3144 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3145 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3146 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3147 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3148 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3149 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3150 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3151 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3152 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3153 c->nz = 0; 3154 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3155 Ccsr->values = new THRUSTARRAY(c->nz); 3156 goto finalizesym; 3157 } 3158 3159 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3160 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3161 Acsr = (CsrMatrix *)Amat->mat; 3162 if (!biscompressed) { 3163 Bcsr = (CsrMatrix *)Bmat->mat; 3164 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3165 BmatSpDescr = Bmat->matDescr; 3166 #endif 3167 } else { /* we need to use row offsets for the full matrix */ 3168 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3169 Bcsr = new CsrMatrix; 3170 Bcsr->num_rows = B->rmap->n; 3171 Bcsr->num_cols = cBcsr->num_cols; 3172 Bcsr->num_entries = cBcsr->num_entries; 3173 Bcsr->column_indices = cBcsr->column_indices; 3174 Bcsr->values = cBcsr->values; 3175 if (!Bcusp->rowoffsets_gpu) { 3176 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3177 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3178 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3179 } 3180 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3181 mmdata->Bcsr = Bcsr; 3182 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3183 if (Bcsr->num_rows && Bcsr->num_cols) { 3184 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3185 PetscCallCUSPARSE(stat); 3186 } 3187 BmatSpDescr = mmdata->matSpBDescr; 3188 #endif 3189 } 3190 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3191 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3192 /* precompute flops count */ 3193 if (ptype == MATPRODUCT_AB) { 3194 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3195 const PetscInt st = a->i[i]; 3196 const PetscInt en = a->i[i + 1]; 3197 for (j = st; j < en; j++) { 3198 const PetscInt brow = a->j[j]; 3199 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3200 } 3201 } 3202 } else if (ptype == MATPRODUCT_AtB) { 3203 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3204 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3205 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3206 flops += (2. * anzi) * bnzi; 3207 } 3208 } else { /* TODO */ 3209 flops = 0.; 3210 } 3211 3212 mmdata->flops = flops; 3213 PetscCall(PetscLogGpuTimeBegin()); 3214 3215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3216 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3217 // cuda-12.2 requires non-null csrRowOffsets 3218 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3219 PetscCallCUSPARSE(stat); 3220 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3221 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3222 { 3223 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3224 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3225 */ 3226 void *dBuffer1 = NULL; 3227 void *dBuffer2 = NULL; 3228 void *dBuffer3 = NULL; 3229 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3230 size_t bufferSize1 = 0; 3231 size_t bufferSize2 = 0; 3232 size_t bufferSize3 = 0; 3233 size_t bufferSize4 = 0; 3234 size_t bufferSize5 = 0; 3235 3236 /* ask bufferSize1 bytes for external memory */ 3237 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3238 PetscCallCUSPARSE(stat); 3239 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3240 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3241 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3242 PetscCallCUSPARSE(stat); 3243 3244 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3245 PetscCallCUSPARSE(stat); 3246 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3247 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3248 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3249 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3250 PetscCallCUSPARSE(stat); 3251 PetscCallCUDA(cudaFree(dBuffer1)); 3252 PetscCallCUDA(cudaFree(dBuffer2)); 3253 3254 /* get matrix C non-zero entries C_nnz1 */ 3255 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3256 c->nz = (PetscInt)C_nnz1; 3257 /* allocate matrix C */ 3258 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3259 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3260 Ccsr->values = new THRUSTARRAY(c->nz); 3261 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3262 /* update matC with the new pointers */ 3263 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3264 PetscCallCUSPARSE(stat); 3265 3266 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3267 PetscCallCUSPARSE(stat); 3268 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3269 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3270 PetscCallCUSPARSE(stat); 3271 PetscCallCUDA(cudaFree(dBuffer3)); 3272 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3273 PetscCallCUSPARSE(stat); 3274 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3275 } 3276 #else 3277 size_t bufSize2; 3278 /* ask bufferSize bytes for external memory */ 3279 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3280 PetscCallCUSPARSE(stat); 3281 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3282 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3283 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3284 PetscCallCUSPARSE(stat); 3285 /* ask bufferSize again bytes for external memory */ 3286 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3287 PetscCallCUSPARSE(stat); 3288 /* The CUSPARSE documentation is not clear, nor the API 3289 We need both buffers to perform the operations properly! 3290 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3291 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3292 is stored in the descriptor! What a messy API... 
*/ 3293 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3294 /* compute the intermediate product of A * B */ 3295 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3296 PetscCallCUSPARSE(stat); 3297 /* get matrix C non-zero entries C_nnz1 */ 3298 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3299 c->nz = (PetscInt)C_nnz1; 3300 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3301 mmdata->mmBufferSize / 1024)); 3302 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3303 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3304 Ccsr->values = new THRUSTARRAY(c->nz); 3305 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3306 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3307 PetscCallCUSPARSE(stat); 3308 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3309 PetscCallCUSPARSE(stat); 3310 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3311 #else 3312 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3313 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3314 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3315 PetscCallCUSPARSE(stat); 3316 c->nz = cnz; 3317 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3318 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3319 Ccsr->values = new THRUSTARRAY(c->nz); 3320 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3321 3322 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3323 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3324 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3325 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3326 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3327 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3328 PetscCallCUSPARSE(stat); 3329 #endif 3330 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3331 PetscCall(PetscLogGpuTimeEnd()); 3332 finalizesym: 3333 c->free_a = PETSC_TRUE; 3334 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3335 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3336 c->free_ij = PETSC_TRUE; 3337 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3338 PetscInt *d_i = c->i; 3339 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3340 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3341 ii = 
*Ccsr->row_offsets; 3342 jj = *Ccsr->column_indices; 3343 if (ciscompressed) d_i = c->compressedrow.i; 3344 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3345 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3346 } else { 3347 PetscInt *d_i = c->i; 3348 if (ciscompressed) d_i = c->compressedrow.i; 3349 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3350 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3351 } 3352 if (ciscompressed) { /* need to expand host row offsets */ 3353 PetscInt r = 0; 3354 c->i[0] = 0; 3355 for (k = 0; k < c->compressedrow.nrows; k++) { 3356 const PetscInt next = c->compressedrow.rindex[k]; 3357 const PetscInt old = c->compressedrow.i[k]; 3358 for (; r < next; r++) c->i[r + 1] = old; 3359 } 3360 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3361 } 3362 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3363 PetscCall(PetscMalloc1(m, &c->ilen)); 3364 PetscCall(PetscMalloc1(m, &c->imax)); 3365 c->maxnz = c->nz; 3366 c->nonzerorowcnt = 0; 3367 c->rmax = 0; 3368 for (k = 0; k < m; k++) { 3369 const PetscInt nn = c->i[k + 1] - c->i[k]; 3370 c->ilen[k] = c->imax[k] = nn; 3371 c->nonzerorowcnt += (PetscInt)!!nn; 3372 c->rmax = PetscMax(c->rmax, nn); 3373 } 3374 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3375 PetscCall(PetscMalloc1(c->nz, &c->a)); 3376 Ccsr->num_entries = c->nz; 3377 3378 C->nonzerostate++; 3379 PetscCall(PetscLayoutSetUp(C->rmap)); 3380 PetscCall(PetscLayoutSetUp(C->cmap)); 3381 Ccusp->nonzerostate = C->nonzerostate; 3382 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3383 C->preallocated = PETSC_TRUE; 3384 C->assembled = 
PETSC_FALSE; 3385 C->was_assembled = PETSC_FALSE; 3386 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3387 mmdata->reusesym = PETSC_TRUE; 3388 C->offloadmask = PETSC_OFFLOAD_GPU; 3389 } 3390 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3391 PetscFunctionReturn(PETSC_SUCCESS); 3392 } 3393 3394 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3395 3396 /* handles sparse or dense B */ 3397 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3398 { 3399 Mat_Product *product = mat->product; 3400 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3401 3402 PetscFunctionBegin; 3403 MatCheckProduct(mat, 1); 3404 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3405 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3406 if (product->type == MATPRODUCT_ABC) { 3407 Ciscusp = PETSC_FALSE; 3408 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3409 } 3410 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3411 PetscBool usecpu = PETSC_FALSE; 3412 switch (product->type) { 3413 case MATPRODUCT_AB: 3414 if (product->api_user) { 3415 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3416 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3417 PetscOptionsEnd(); 3418 } else { 3419 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3420 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 
3421 PetscOptionsEnd(); 3422 } 3423 break; 3424 case MATPRODUCT_AtB: 3425 if (product->api_user) { 3426 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3427 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3428 PetscOptionsEnd(); 3429 } else { 3430 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3431 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3432 PetscOptionsEnd(); 3433 } 3434 break; 3435 case MATPRODUCT_PtAP: 3436 if (product->api_user) { 3437 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3438 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3439 PetscOptionsEnd(); 3440 } else { 3441 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3442 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3443 PetscOptionsEnd(); 3444 } 3445 break; 3446 case MATPRODUCT_RARt: 3447 if (product->api_user) { 3448 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3449 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3450 PetscOptionsEnd(); 3451 } else { 3452 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3453 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3454 PetscOptionsEnd(); 3455 } 3456 break; 3457 case MATPRODUCT_ABC: 3458 if (product->api_user) { 3459 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, 
"MatMatMatMult", "Mat"); 3460 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3461 PetscOptionsEnd(); 3462 } else { 3463 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3464 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3465 PetscOptionsEnd(); 3466 } 3467 break; 3468 default: 3469 break; 3470 } 3471 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3472 } 3473 /* dispatch */ 3474 if (isdense) { 3475 switch (product->type) { 3476 case MATPRODUCT_AB: 3477 case MATPRODUCT_AtB: 3478 case MATPRODUCT_ABt: 3479 case MATPRODUCT_PtAP: 3480 case MATPRODUCT_RARt: 3481 if (product->A->boundtocpu) { 3482 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3483 } else { 3484 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3485 } 3486 break; 3487 case MATPRODUCT_ABC: 3488 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3489 break; 3490 default: 3491 break; 3492 } 3493 } else if (Biscusp && Ciscusp) { 3494 switch (product->type) { 3495 case MATPRODUCT_AB: 3496 case MATPRODUCT_AtB: 3497 case MATPRODUCT_ABt: 3498 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3499 break; 3500 case MATPRODUCT_PtAP: 3501 case MATPRODUCT_RARt: 3502 case MATPRODUCT_ABC: 3503 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3504 break; 3505 default: 3506 break; 3507 } 3508 } else { /* fallback for AIJ */ 3509 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3510 } 3511 PetscFunctionReturn(PETSC_SUCCESS); 3512 } 3513 3514 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3515 { 3516 PetscFunctionBegin; 3517 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3518 PetscFunctionReturn(PETSC_SUCCESS); 3519 } 3520 3521 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, 
Vec yy, Vec zz) 3522 { 3523 PetscFunctionBegin; 3524 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3525 PetscFunctionReturn(PETSC_SUCCESS); 3526 } 3527 3528 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3529 { 3530 PetscFunctionBegin; 3531 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3532 PetscFunctionReturn(PETSC_SUCCESS); 3533 } 3534 3535 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3536 { 3537 PetscFunctionBegin; 3538 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3539 PetscFunctionReturn(PETSC_SUCCESS); 3540 } 3541 3542 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3543 { 3544 PetscFunctionBegin; 3545 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3546 PetscFunctionReturn(PETSC_SUCCESS); 3547 } 3548 3549 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3550 { 3551 int i = blockIdx.x * blockDim.x + threadIdx.x; 3552 if (i < n) y[idx[i]] += x[i]; 3553 } 3554 3555 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3556 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3557 { 3558 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3559 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3560 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3561 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3562 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3563 PetscBool compressed; 3564 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3565 PetscInt nx, ny; 3566 #endif 3567 3568 PetscFunctionBegin; 3569 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3570 if (!a->nz) { 3571 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3572 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3573 PetscFunctionReturn(PETSC_SUCCESS); 3574 } 3575 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3576 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3577 if (!trans) { 3578 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3579 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3580 } else { 3581 if (herm || !A->form_explicit_transpose) { 3582 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3583 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3584 } else { 3585 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3586 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3587 } 3588 } 3589 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3590 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3591 3592 try { 3593 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3594 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3595 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3596 3597 PetscCall(PetscLogGpuTimeBegin()); 3598 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3599 /* z = A x + beta y. 3600 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3601 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3602 */ 3603 xptr = xarray; 3604 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3605 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3606 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3607 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3608 allocated to accommodate different uses. So we get the length info directly from mat. 3609 */ 3610 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3611 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3612 nx = mat->num_cols; // since y = Ax 3613 ny = mat->num_rows; 3614 } 3615 #endif 3616 } else { 3617 /* z = A^T x + beta y 3618 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3619 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3620 */ 3621 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3622 dptr = zarray; 3623 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3624 if (compressed) { /* Scatter x to work vector */ 3625 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3626 3627 thrust::for_each( 3628 #if PetscDefined(HAVE_THRUST_ASYNC) 3629 thrust::cuda::par.on(PetscDefaultCudaStream), 3630 #endif 3631 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3632 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3633 } 3634 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3635 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3636 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3637 nx = mat->num_rows; // since y = A^T x 3638 ny = mat->num_cols; 3639 } 3640 #endif 3641 } 3642 3643 /* csr_spmv does y = alpha op(A) x + beta y */ 3644 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3645 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3646 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3647 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 
3648 #else 3649 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3650 #endif 3651 3652 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3653 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3654 if (!matDescr) { 3655 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3656 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3657 } 3658 #endif 3659 3660 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3661 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3662 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3663 PetscCallCUSPARSE( 3664 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3665 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3666 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3667 PetscCallCUSPARSE( 3668 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3669 #endif 3670 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3671 } else { 3672 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3673 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3674 
PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3675 } 3676 3677 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3678 #else 3679 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3680 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3681 #endif 3682 } else { 3683 if (cusparsestruct->nrows) { 3684 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3685 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3686 #else 3687 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3688 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3689 #endif 3690 } 3691 } 3692 PetscCall(PetscLogGpuTimeEnd()); 3693 3694 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3695 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3696 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3697 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3698 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3699 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3700 } 3701 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - thin wrapper that forwards (A, xx, yy, zz) to
  MatMultAddKernel_SeqAIJCUSPARSE() with the two trailing flags set to PETSC_TRUE, PETSC_FALSE.

  NOTE(review): the flags are presumably (transpose, hermitian), i.e. zz = A^T xx + yy -- confirm
  against the kernel's parameter names.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAssemblyEnd_SeqAIJCUSPARSE - assembly-end hook for this type; it only calls the host
  MatAssemblyEnd_SeqAIJ() and does no GPU work of its own.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
.
nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.
When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
          `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatDestroy_SeqAIJCUSPARSE - destroy callback for MATSEQAIJCUSPARSE.

  Releases the GPU-side struct kept in A->spptr (a different struct type for factored and
  unfactored matrices), detaches the functions composed on the object, then lets
  MatDestroy_SeqAIJ() free the host part.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed function names, listed in the order in which they are detached */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };
  const size_t ncomposed = sizeof(composed) / sizeof(composed[0]);

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  for (size_t k = 0; k < ncomposed; k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - duplicates A with the host SeqAIJ routine, then converts the
  copy in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str !=
SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nonzero count: compare row offsets, then column indices, on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + b*Y with b = 1 via spgeam; Y's descriptor, values and index arrays are passed as the output, so Y is updated in place */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars are passed as &a and &b (host addresses), so use host pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    /* NOTE(review): if this call errors out, `buffer` appears to be leaked and the handle left in host pointer mode -- confirm whether that matters */
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* switch the handle back to device pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so Y += a*X is one cuBLAS axpy over x->nz values */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* any other structure: mark the cached transpose as out of date and use the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatScale_SeqAIJCUSPARSE - scales the y->nz stored values of Y by `a` with a single cuBLAS scal
  on the array obtained from MatSeqAIJCUSPARSEGetArray().
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatZeroEntries_SeqAIJCUSPARSE - zeros the stored values on the host and, where device copies
  exist, on the GPU as well (both the matrix and its cached transpose).

  The offload mask ends up as PETSC_OFFLOAD_BOTH only when the device values of the matrix
  itself were zeroed; otherwise only the host copy is marked valid.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   gpu_zeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE           *cusp       = (Mat_SeqAIJCUSPARSE *)A->spptr;
    Mat_SeqAIJCUSPARSEMultStruct *structs[2] = {cusp->mat, cusp->matTranspose}; /* [0] matrix, [1] cached transpose */

    for (int k = 0; k < 2; k++) {
      if (!structs[k]) continue;
      CsrMatrix *csr = (CsrMatrix *)structs[k]->mat;
      if (!csr->values) continue;
      thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      if (k == 0) gpu_zeroed = PETSC_TRUE; /* only the matrix itself decides the offload state */
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatGetCurrentMemType_SeqAIJCUSPARSE - always reports PETSC_MEMTYPE_CUDA; the matrix argument is not inspected */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale = MatScale_SeqAIJ;
    A->ops->axpy
= MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* clear every SeqAIJ-level op (the array accessors installed in the GPU branch below) */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* detach the GPU-specific composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* unbinding: install the CUSPARSE implementations of the Mat ops ... */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    /* ... and the SeqAIJ-level array accessors */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode routines are enabled only while bound to the CPU and only if a->inode.size_csr is set */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into MATSEQAIJCUSPARSE.

  MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A's values into the
  existing *newmat, and any other `reuse` value operates on *newmat as passed in. The MatType
  argument is unnamed and unused.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix default to VECCUDA */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,
&B->defaultvectype));

  /* allocate the GPU-side struct only when B does not have one yet; its type depends on whether B is a factor */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* ops that stay the same whether or not the matrix is bound to the CPU */
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* install the remaining (GPU) ops through the unbound branch of the bind routine */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCreate_SeqAIJCUSPARSE - type constructor: builds a plain SeqAIJ matrix and converts it in
  place to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+ -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                          Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU

  Level: beginner

  Notes:
  These matrices can be in either CSR, ELL, or HYB format.

  All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

  Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
  if some integer values passed in do not fit in `int`.
.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  MatSolverTypeRegister_CUSPARSE - registers MatGetFactor_seqaijcusparse_cusparse() as the
  MATSOLVERCUSPARSE factory for MATSEQAIJCUSPARSE, once per supported factorization
  (LU, Cholesky, ILU, ICC, in that order).
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(ftypes) / sizeof(ftypes[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSE_Destroy - frees the Mat_SeqAIJCUSPARSE struct stored in mat->spptr (if any):
  both mult structs, the device work arrays, the cuSPARSE handle and finally the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS); /* nothing was ever attached */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;
  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* CsrMatrix_Destroy - deletes the three arrays of a CsrMatrix and the struct itself, then clears the caller's pointer; no-op when *mat is null */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  /* Overload for one triangular-factor struct (only compiled for CUDA < 11.4): releases each
     member that is set, then frees the struct with PetscFree(). No-op when *trifactor is null. */
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    /* AA_h is released with cudaFreeHost(), unlike the device buffers above */
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - releases a mult struct (the matrix storage in the given
  format, its descriptor, the compressed-row indices and the device scalars alpha_one /
  beta_zero / beta_one), deletes the struct and sets *matstruct to NULL.

  With CUDA >= 11.0 the ELL and HYB formats are rejected with PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        /* CSR: destroy through a local copy of the pointer; the struct member itself is not cleared, the struct is deleted below */
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one SpMV context per cusparseOperation_t value (0..2) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        /* NOTE(review): these descriptors are released only when cuSpMV[i] was initialized; confirm that
           matDescr_SpMM[i] can never be created while cuSpMV[i].initialized is still false, otherwise it leaks */
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases everything a triangular-factor struct owns, but
  keeps the struct itself and its cuSPARSE handle (MatSeqAIJCUSPARSETriFactors_Destroy() frees those).

  Input Parameter:
. trifactors - address of the pointer to the struct; a NULL struct is a no-op

  Every device pointer and cuSPARSE descriptor is reset to NULL immediately after it is
  released, like rpermIndices/cpermIndices. This function is not static and
  MatSeqAIJCUSPARSETriFactors_Destroy() calls it again, so a Reset followed by another Reset
  or by Destroy must not pass stale pointers to cudaFree() or the cusparseDestroy*() routines.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    /* NOTE(review): factBuffer_M is left untouched here; per the remark above it aliases one of the
       buffers freed below, so it should not be dereferenced until it is assigned again -- confirm */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    /* cuSPARSE descriptors and info objects */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;
    /* host mirrors */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular-factor struct, destroys its
  cuSPARSE handle and frees the struct. No-op when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic "less than" on (i, j) index pairs: compare the first components, break ties with the second */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) ==
thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4300 return false; 4301 } 4302 }; 4303 4304 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4305 { 4306 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4307 4308 PetscFunctionBegin; 4309 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4310 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4311 if (destroy) { 4312 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4313 delete cusp->csr2csc_i; 4314 cusp->csr2csc_i = NULL; 4315 } 4316 A->transupdated = PETSC_FALSE; 4317 PetscFunctionReturn(PETSC_SUCCESS); 4318 } 4319 4320 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4321 { 4322 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 4323 4324 PetscFunctionBegin; 4325 PetscCallCUDA(cudaFree(coo->perm)); 4326 PetscCallCUDA(cudaFree(coo->jmap)); 4327 PetscCall(PetscFree(coo)); 4328 PetscFunctionReturn(PETSC_SUCCESS); 4329 } 4330 4331 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4332 { 4333 PetscBool dev_ij = PETSC_FALSE; 4334 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4335 PetscInt *i, *j; 4336 PetscContainer container_h; 4337 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4338 4339 PetscFunctionBegin; 4340 PetscCall(PetscGetMemType(coo_i, &mtype)); 4341 if (PetscMemTypeDevice(mtype)) { 4342 dev_ij = PETSC_TRUE; 4343 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4344 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4345 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4346 } else { 4347 i = coo_i; 4348 j = coo_j; 4349 } 4350 4351 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4352 if (dev_ij) PetscCall(PetscFree2(i, j)); 4353 mat->offloadmask = PETSC_OFFLOAD_CPU; 4354 // Create the GPU memory 4355 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4356 4357 // 
/*
  MatAddCOOValues - CUDA kernel that folds COO input values into the CSR value array.

  For every CSR entry i in [0, nnz) the kernel sums kv[perm[k]] for k in [jmap[i], jmap[i + 1]) and
  either overwrites a[i] with that sum (imode == INSERT_VALUES) or adds the sum to a[i] (any other mode).

  Launch layout: 1D grid of 1D blocks. Each thread starts at its global thread index and advances by the
  total number of launched threads, so any grid size covers all nnz entries. The start index and the
  stride are formed in PetscCount so the products blockIdx.x * blockDim.x and gridDim.x * blockDim.x
  cannot wrap around in 32-bit unsigned arithmetic.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  MatSetValuesCOO_SeqAIJCUSPARSE - inserts or adds the COO values v[] into A on the device.

  Input Parameters:
+ A     - the matrix; its device COO maps are read from the container composed on A under
          "__PETSc_MatCOOStruct_Device"
. v     - the COO values, in host or device memory (detected with PetscGetMemType())
- imode - INSERT_VALUES overwrites the stored values, any other mode accumulates into them

  Notes:
  When v[] is in host memory it is staged through a temporary device buffer of coo->n scalars that is
  freed before returning.

  The grid size is ceil(nnz / 256) capped at 2147483647 blocks. The division is done in PetscCount before
  narrowing to int; the kernel's striding loop picks up any entries beyond the capped grid.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* write-only access skips the host-to-device copy of values that are about to be overwritten */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    const PetscCount nblocks = (Annz + 255) / 256;
    const int        grid    = nblocks > 2147483647 ? 2147483647 : (int)nblocks;

    MatAddCOOValues<<<grid, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`; pass `NULL` if not needed
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`; pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  `i` and `j` may be requested independently; the routine is a no-op only when both are `NULL`

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS); /* nothing was requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* A's private data is read only after A has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  a    = (Mat_SeqAIJ *)A->data;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the uncompressed row offsets on the device once, from the host row pointer array, and cache them */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreIJ - gives back the device `i` and `j` index arrays obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE`, the value that was passed to `MatSeqAIJCUSPARSEGetIJ()`; currently unused
. i          - the CSR row pointers, set to `NULL` on return when provided
- j          - the CSR column indices, set to `NULL` on return when provided

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - provides read-only access to the device array holding the nonzero values of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)spptr->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix state is left untouched, only the caller's pointer is cleared */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArray - provides read-write access to the device array holding the values of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)spptr->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* the caller may modify the values: the device copy becomes the valid one and the cached transpose is marked stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a             = matrix->values->data().get();
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - provides write access to the device array holding the values of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format is handled */
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no copy to the GPU is triggered here, so the device structure must already exist */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)spptr->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a             = matrix->values->data().get();
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have changed: invalidate the diagonal information and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict ordering of (row, column, value, tag) tuples by row first and column second; value and tag are ignored */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int row1 = thrust::get<0>(t1), row2 = thrust::get<0>(t2);

    if (row1 != row2) return row1 < row2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};

/* Unary functor adding a fixed integer offset to its argument */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
/*
  MatSeqAIJCUSPARSEMergeMats - merges two MATSEQAIJCUSPARSE matrices A and B that have the same number of
  rows into C = [A, B] ([A';B']' in MATLAB notation): C has A->rmap->n rows and A->cmap->n + B->cmap->n
  columns, the column indices of B being shifted by A->cmap->n.

  Input Parameters:
+ A     - first matrix
. B     - second matrix, with the same number of local rows as A
- reuse - MAT_INITIAL_MATRIX to create C, otherwise the values of an existing C are refreshed;
          MAT_INPLACE_MATRIX is not supported

  Output Parameter:
. C - the merged matrix

  Notes:
  With MAT_INITIAL_MATRIX the nonzero pattern of C is built on the GPU and only the row offsets and
  column indices are copied back to the host; the host value array is allocated but not filled, and C is
  marked PETSC_OFFLOAD_GPU.

  Ccusp->coords records, for the entries of A followed by the entries of B, their position in the value
  array of C. The reuse path relies on it to scatter new values without recomputing the pattern.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (uncompressed) device CSR container */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants attached to the multiply structure */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* temporary COO row indices for A, B and the merged C */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit per-entry row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* origin tag carried through the merge: 1 for entries of A, 0 for entries of B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      /* p1 receives the positions in C of the entries of A, p2 (offset by Annz) those of B */
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(p2, Annz);
#else
      thrust::advance(p2, Annz);
#endif
      /* merge the (row, col, value, tag) streams of A and B, ordered by (row, col); the tags end up in wPerm */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to the column indices of B above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* positions whose tag is nonzero (from A) go to p1, positions whose tag is zero (from B) go to p2 */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C stacks the transpose of A on top of the transpose of B: its row offsets are those of
           A^T followed by those of B^T shifted by a->nz, the last offset of A^T being overwritten by the first of B^T */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
#if CCCL_VERSION >= 3001000
          cuda::std::advance(rT, -1);
#else
          thrust::advance(rT, -1);
#endif
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        /* column indices and values of the two transposes are simply concatenated */
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero pattern (not the values) on the host */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* reuse: the pattern of C is kept, only its values are refreshed from A and B */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0, A nnz) addresses the entries coming from A, coords[A nnz, end) those coming from B */
      auto pmid = Ccusp->coords->begin();
#if CCCL_VERSION >= 3001000
      cuda::std::advance(pmid, Acsr->num_entries);
#else
      thrust::advance(pmid, Acsr->num_entries);
#endif
      PetscCall(PetscLogGpuTimeBegin());
      /* VecCUDAEquals is defined elsewhere; presumably it assigns the first tuple element to the second,
         i.e. scatters the values of A (then B) into the values of C through coords -- confirm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* refresh the values of the transpose of C by concatenating those of the transposes of A and B */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies entries of the device value array of A into v[].

  Input Parameters:
+ A   - the matrix
. n   - number of entries to copy
- idx - host array of n positions in the value array of A, or NULL to copy the first n values

  Output Parameter:
. v - destination array, in host or device memory (detected with isCudaMem())

  Notes:
  With idx the gather is done on the device; when v[] is on the host the result is staged through a
  temporary device buffer and then copied back.

  The transfer of the values to a host v[] is logged as a GPU-to-CPU copy.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* the positions are needed on the device for the gather */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* staging buffer, sized only when v[] lives on the host; being a local object it is
       released on every exit path, including early returns from the error checks below */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* VecCUDAEquals is defined elsewhere; presumably it assigns the first tuple element to the second,
       i.e. dv[k] = av[idx[k]] -- confirm */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}