1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #include <thrust/async/for_each.h> 19 #endif 20 #include <thrust/iterator/constant_iterator.h> 21 #include <thrust/remove.h> 22 #include <thrust/sort.h> 23 #include <thrust/unique.h> 24 25 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
30 31 typedef enum { 32 CUSPARSE_MV_ALG_DEFAULT = 0, 33 CUSPARSE_COOMV_ALG = 1, 34 CUSPARSE_CSRMV_ALG1 = 2, 35 CUSPARSE_CSRMV_ALG2 = 3 36 } cusparseSpMVAlg_t; 37 38 typedef enum { 39 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44 CUSPARSE_SPMM_ALG_DEFAULT = 0, 45 CUSPARSE_SPMM_COO_ALG1 = 1, 46 CUSPARSE_SPMM_COO_ALG2 = 2, 47 CUSPARSE_SPMM_COO_ALG3 = 3, 48 CUSPARSE_SPMM_COO_ALG4 = 5, 49 CUSPARSE_SPMM_CSR_ALG1 = 4, 50 CUSPARSE_SPMM_CSR_ALG2 = 6, 51 } cusparseSpMMAlg_t; 52 53 typedef enum { 54 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 55 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 56 } cusparseCsr2CscAlg_t; 57 */ 58 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61 #endif 62 63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 73 #endif 74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 84 85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 87 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

// Plugin implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
// Records the requested GPU storage format in the matrix's cuSPARSE context.
// For a sequential matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL store the same
// single format field, so the two cases share one body.
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: // fall through: both operations set the same per-matrix format
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
       `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  // Dispatch through the composed method so non-cusparse types silently ignore the call
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Plugin implementation behind MatCUSPARSESetUseCPUSolve(): records whether the
// triangular solves of MatSolve() should run on the CPU instead of the GPU.
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  // Dispatch through the composed method so non-cusparse types silently ignore the call
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatSetOption() override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so that a stale
// explicit transpose on the GPU is destroyed when the option is turned off; all
// other options are forwarded to the host MATSEQAIJ implementation.
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Parses the -mat_cusparse_* command line options (storage format, CPU solve,
// SpMV/SpMM/csr2csc algorithm selection). Algorithm options are only meaningful
// for non-factored matrices, hence the MAT_FACTOR_NONE guard.
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Re-packs the host (I)LU factors (stored skewed in Mat_SeqAIJ: L without its unit
// diagonal in i/j/a, U backwards via diag[]) into one regular CSR matrix M = L+U on
// the device, and (re)builds the cuSPARSE SpSV descriptors for the two triangular solves.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a     = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m     = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai    = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa    = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                          // entries of L
        Mj[Mi[i] + llen] = i;                                                            // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U are two views (lower/unit vs upper/non-unit) over the SAME device CSR arrays
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: host row pointers and host value buffer are kept so later
      // numeric re-factorizations only have to refill values (structure is fixed)
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj)); // column indices already on device; host copy no longer needed
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                          // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                // recover the diagonal entry (host stores its reciprocal)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their own analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
// Legacy (CUDA < 11.4) path: builds the lower triangular factor L (with an explicit
// unit diagonal appended per row) as its own CsrMatrix and runs the csrsv solve analysis.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        // pinned host buffers so the thrust assign() below can copy to device efficiently
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0; // row 0 of L is just the unit diagonal
        v       = aa;
        vi      = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;      // append the unit diagonal entry of this row
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; // keep pinned value buffer for later numeric updates
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; // unit diagonal entry
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Legacy (CUDA < 11.4) path: builds the upper triangular factor U (diagonal stored as
// its reciprocal by the host factorization, recovered here) and runs the csrsv analysis.
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0] - adiag[n]; // adiag[] stores U rows back-to-front; difference is Unz
      if (!upTriFactor) {
        PetscScalar *AAUp;

        // pinned host buffers so the thrust assign() below can copy to device efficiently
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) { // walk rows backwards, matching the skewed host layout
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; // host stores reciprocal of the diagonal; invert back
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; // keep pinned value buffer for later numeric updates
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

// Ensures the (I)LU factors live on the GPU and that cuSPARSE solve analysis has been
// run; also uploads the row/column permutation index arrays needed by MatSolve when the
// orderings are not the identity.
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) { // only upload the row permutation once, and only if non-trivial
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) { // only upload the column permutation once, and only if non-trivial
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Re-packs the host Cholesky/ICC factor (U stored row-wise with unit diagonal implied;
// actual diagonal kept separately) into a regular CSR matrix plus a diagonal vector on
// the device, and builds the SpSV descriptors for the U and U^T solves.
// NOTE(review): "Cheolesky" is a long-standing spelling in this identifier; kept as-is
// because its caller is outside this view.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a     = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m     = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs    = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai    = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa    = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      // cusparseSpSV_updateMatrix() (CUDA >= 12.1.1) lets us refresh values without redoing the full analysis
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
// CUDA < 11.4 path: builds the upper triangular ICC factor U (and L = Ut, stored as the
// same CSR with a transposed solve op) on the GPU using the legacy csrsv API.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz]; // lower factor carries the extra diagonal scaling
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        // L is stored as the same upper CSR data and solved with a TRANSPOSE operation below
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

// Sets up the GPU side of an ICC (Cholesky) triangular solve for A, and caches the
// factorization's row permutation and its inverse as device index arrays when not identity.
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
cusparseTriFactors->rpermIndices->assign(rip, rip + n); 1021 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1022 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1023 PetscCall(ISRestoreIndices(iip, &irip)); 1024 PetscCall(ISDestroy(&iip)); 1025 PetscCall(ISRestoreIndices(ip, &rip)); 1026 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1027 } 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 1031 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1032 { 1033 PetscFunctionBegin; 1034 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1035 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1036 B->offloadmask = PETSC_OFFLOAD_CPU; 1037 1038 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1039 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1040 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1041 #else 1042 /* determine which version of MatSolve needs to be used. */ 1043 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1044 IS ip = b->row; 1045 PetscBool perm_identity; 1046 1047 PetscCall(ISIdentity(ip, &perm_identity)); 1048 if (perm_identity) { 1049 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1050 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1051 } else { 1052 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1053 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1054 } 1055 #endif 1056 B->ops->matsolve = NULL; 1057 B->ops->matsolvetranspose = NULL; 1058 1059 /* get the triangular factors */ 1060 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1061 PetscFunctionReturn(PETSC_SUCCESS); 1062 } 1063 1064 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1065 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1066 { 1067 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1068 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 
1069 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1070 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1071 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1072 cusparseIndexBase_t indexBase; 1073 cusparseMatrixType_t matrixType; 1074 cusparseFillMode_t fillMode; 1075 cusparseDiagType_t diagType; 1076 1077 PetscFunctionBegin; 1078 /* allocate space for the transpose of the lower triangular factor */ 1079 PetscCall(PetscNew(&loTriFactorT)); 1080 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1081 1082 /* set the matrix descriptors of the lower triangular factor */ 1083 matrixType = cusparseGetMatType(loTriFactor->descr); 1084 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1085 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1086 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1087 1088 /* Create the matrix description */ 1089 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1090 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1091 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1092 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1093 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1094 1095 /* set the operation */ 1096 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1097 1098 /* allocate GPU space for the CSC of the lower triangular factor*/ 1099 loTriFactorT->csrMat = new CsrMatrix; 1100 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1101 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1102 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1103 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1104 loTriFactorT->csrMat->column_indices = new 
THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1105 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1106 1107 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1108 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1109 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1110 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1111 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1112 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1113 #endif 1114 1115 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1116 { 1117 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1118 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1119 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1120 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1121 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1122 #else 1123 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1124 #endif 1125 PetscCallCUSPARSE(stat); 1126 } 1127 1128 PetscCallCUDA(WaitForCUDA()); 1129 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1130 1131 /* Create the solve analysis information */ 1132 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1133 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1134 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1135 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1136 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1137 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1138 #endif 1139 1140 /* perform the solve analysis */ 1141 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1142 
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1143 1144 PetscCallCUDA(WaitForCUDA()); 1145 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1146 1147 /* assign the pointer */ 1148 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1149 1150 /*********************************************/ 1151 /* Now the Transpose of the Upper Tri Factor */ 1152 /*********************************************/ 1153 1154 /* allocate space for the transpose of the upper triangular factor */ 1155 PetscCall(PetscNew(&upTriFactorT)); 1156 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1157 1158 /* set the matrix descriptors of the upper triangular factor */ 1159 matrixType = cusparseGetMatType(upTriFactor->descr); 1160 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1161 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1162 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1163 1164 /* Create the matrix description */ 1165 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1166 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1167 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1168 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1169 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1170 1171 /* set the operation */ 1172 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1173 1174 /* allocate GPU space for the CSC of the upper triangular factor*/ 1175 upTriFactorT->csrMat = new CsrMatrix; 1176 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1177 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1178 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1179 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1180 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1181 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1182 1183 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1184 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1185 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1186 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1187 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1188 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1189 #endif 1190 1191 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1192 { 1193 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1194 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1195 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1196 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1197 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1198 #else 1199 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1200 #endif 1201 PetscCallCUSPARSE(stat); 1202 } 1203 1204 PetscCallCUDA(WaitForCUDA()); 1205 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1206 1207 /* Create the solve analysis information */ 1208 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1209 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1210 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1211 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1212 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1213 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1214 #endif 1215 1216 /* perform the solve analysis */ 1217 /* christ, would it have killed you to put this stuff in a function????????? */ 1218 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1219 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1220 1221 PetscCallCUDA(WaitForCUDA()); 1222 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1223 1224 /* assign the pointer */ 1225 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1226 PetscFunctionReturn(PETSC_SUCCESS); 1227 } 1228 #endif 1229 1230 struct PetscScalarToPetscInt { 1231 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1232 }; 1233 1234 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1235 { 1236 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1237 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1238 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1239 cusparseStatus_t stat; 1240 cusparseIndexBase_t indexBase; 1241 1242 PetscFunctionBegin; 1243 
  /* Make sure A's CSR data is present on the GPU before reading it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is current; nothing to do */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalars used by cusparse SpMV/SpMM calls) */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n; /* dimensions are swapped relative to A */
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* pre-CUDA-11 HYB/ELL path: round-trip HYB -> CSR -> CSC -> HYB to obtain the transpose */
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute (once) the permutation mapping A's values to transpose order by running
         csr2csc on the value sequence 0,1,2,... and reading back the shuffled sequence */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameter and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* Gather A's current values through the cached permutation into the transpose's value array */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b using the SpSV LU factors: L y = b' then U x = y, with optional row/column permutations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt m = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note
that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); // one triangular solve pair touches each nonzero once
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A^T x = b with the LU factors: U^T y = b' then L^T x = y.
   Transpose SpSV descriptors/buffers and the numeric analysis are created lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt m = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* analysis is numeric: redone whenever factor values change (flag reset by the numeric factorization) */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Legacy (pre CUDA-11.4) transpose solve using the cached transposed triangular factors:
   permute b with the row permutation, solve U^T then L^T, then apply the column permutation */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Transpose solve in natural ordering: no permutations, just U^T then L^T */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy (pre CUDA-11.4) solve with reordering: permute b, solve L then U, permute result */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Natural-ordering solve: no permutations, just L then U */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* In-place ILU(0) numeric factorization on the GPU via cusparseXcsrilu02;
   installs the SpSV-based solve/solvetranspose callbacks on fact */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix *Acsr;
  PetscInt m, nz;
  PetscBool flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1: refresh the existing SpSV analysis with the new values instead of redoing it */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU(0) setup: validates A (debug), resets stale factor data,
   mirrors A's i/j structure into fact, and allocates the device CSR arrays for in-place factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
 */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's pattern */

  /* No row/column permutation is used on this path */
  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L from ILU is unit lower triangular; its diagonal is not stored separately */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
*/ 1904 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1905 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1906 } 1907 1908 /* Estimate FLOPs of the numeric factorization */ 1909 { 1910 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1911 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1912 PetscLogDouble flops = 0.0; 1913 1914 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1915 Ai = Aseq->i; 1916 Adiag = Aseq->diag; 1917 for (PetscInt i = 0; i < m; i++) { 1918 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1919 nzRow = Ai[i + 1] - Ai[i]; 1920 nzLeft = Adiag[i] - Ai[i]; 1921 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1922 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1923 */ 1924 nzLeft = (nzRow - 1) / 2; 1925 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1926 } 1927 } 1928 fs->numericFactFlops = flops; 1929 } 1930 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1931 PetscFunctionReturn(PETSC_SUCCESS); 1932 } 1933 1934 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1935 { 1936 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1937 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1938 const PetscScalar *barray; 1939 PetscScalar *xarray; 1940 1941 PetscFunctionBegin; 1942 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1943 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1944 PetscCall(PetscLogGpuTimeBegin()); 1945 1946 /* Solve L*y = b */ 1947 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1948 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1949 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves with the same nonzero count as the factored matrix */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric ICC(0) factorization of A into fact on the GPU with cusparseXcsric02().
   csric02 only reads/writes the lower triangular part of the in-place factor M;
   afterwards the SpSV descriptors for L and L^T are updated or (re)analyzed.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* Same sparsity pattern as the previous factorization: cheap value update is enough */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
       ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* symmetric factorization: transpose solve is the same */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC(0) setup: fact reuses A's full i/j pattern (no fill); allocates device CSR
   arrays, creates descriptors for M and L, sizes work buffers and runs structural analysis.
   The perm IS is unused (this path is only selected for identity orderings by the caller).
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
 */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): no fill beyond A's pattern */

  /* No permutation is used on this path */
  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU, the Cholesky-type factor stores its actual diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call.
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           Since diagonal positions are not computed here, roughly half of the off-diagonal entries are
           assumed to lie left of the diagonal.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* General LU numeric factorization: factor on the CPU with MatLUFactorNumeric_SeqAIJ(),
   then select GPU (or CPU) MatSolve implementations; the triangular factors are copied
   to the GPU lazily via MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU().
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic LU: defer to the host SeqAIJ implementation; only the numeric stage differs */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU: use the fully-on-GPU ILU(0) path when levels==0 and the ordering is the
   identity (and factorization was not requested on the host); otherwise fall back to the
   host symbolic factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A,
isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC: use the fully-on-GPU ICC(0) path when levels==0 and perm is the identity
   (and factorization was not requested on the host); otherwise fall back to the host path.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic Cholesky: defer to the host SeqAIJ implementation; only the numeric stage differs */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reports MATSOLVERCUSPARSE as the solver type of factor matrices created by this package */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE: creates an (unassembled) factor matrix B
   of the same type and wires up the symbolic-factorization function pointers, choosing GPU or
   CPU implementations depending on whether A is bound to the CPU.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copies the matrix values from device to host when the device copy is newer.
   Handles both unfactored matrices (values live in the CsrMatrix) and, for CUDA >= 11.4,
   GPU-factored matrices (values live in fs->csrVal).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJGetArray() implementation: sync values from the GPU, then hand out the host array */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Restore after read-write access: the host copy becomes the authoritative one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access: sync from the GPU but leave the offload mask untouched on restore */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host sync needed since the values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* fresh values live on the host only */
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Return device CSR pointers (row offsets, column indices, values) and the memory type.
   Only valid for unfactored matrices; indices are 32-bit on the device. */
static PetscErrorCode
MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* device arrays are 32-bit (THRUSTINTARRAY32); only compatible with 32-bit PetscInt */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy the host matrix to the GPU. If the nonzero pattern is unchanged and the format is CSR,
   only the values are re-uploaded; otherwise the whole device structure (descriptors, CSR/HYB
   arrays, scalar constants, compressed-row indices) is rebuilt from scratch. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* transpose values are stale now; keep its pattern (PETSC_FALSE = don't destroy) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern changed (or non-CSR format): tear down and rebuild everything */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are represented; ridx maps back to global row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no values yet: upload the pattern only and keep offloadmask at GPU-only */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR then convert to cuSPARSE HYB storage */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once the HYB matrix is built */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* thrust functor: tuple element 1 += element 0 (used for scatter-add of compressed rows) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* thrust functor: tuple element 1 = element 0 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* thrust functor: tuple element 0 = element 1 (copy in the opposite direction) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for AIJCUSPARSE matrix products */
struct MatMatCusparse {
  PetscBool    cisdense;   /* was C originally a (CPU) MATSEQDENSE? convert back after the product */
  PetscScalar *Bt;         /* pre-CUDA-11 only: device buffer holding B^T for ABt/RARt */
  Mat          X;          /* intermediate dense result for RARt/PtAP */
  PetscBool    reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix   *Bcsr;       /* uncompressed copy of B when B uses compressed-row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destroy callback for MatMatCusparse product data: releases all device buffers,
   cuSPARSE descriptors, and the intermediate dense matrix X. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* Numeric phase of sparse (SeqAIJCUSPARSE) x dense (SeqDENSECUDA) products:
   AB, AtB, ABt, PtAP, RARt. RARt/PtAP first compute X = A*op(B) here, then finish with a
   dense-dense multiply below. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* RARt/PtAP write the sparse-dense result into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; /* one descriptor per operation since 12.4 */
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
    /* grow the workspace only when needed; it is reused across numeric calls */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with a cuBLAS geam into the pre-allocated Bt buffer */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = B * X (X = A*B^T) */
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = B^T * X (X = A*B) */
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* restore the original (CPU) types if the caller supplied them */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase for sparse x dense products: sets C's sizes/block sizes/type and allocates the
   MatMatCusparse product data (including the intermediate X for RARt/PtAP). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product =
C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions and block sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of sparse x sparse products (SpGEMM) on the GPU: AB, AtB (via explicit A^T),
   ABt (via explicit B^T). Finishes with the bookkeeping normally done by MatAssemblyEnd_SeqAIJ. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType
ptype;
  MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets AtB/ABt degrade to the plain AB kernel (transposes unsupported by spgemm) */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* use the explicitly formed transpose */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* pattern was fixed during symbolic: only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse x sparse products (SpGEMM): determines C's pattern on the GPU.
   (Body continues beyond this chunk.) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3067 3068 PetscFunctionBegin; 3069 MatCheckProduct(C, 1); 3070 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3071 A = product->A; 3072 B = product->B; 3073 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3074 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3075 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3076 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3077 a = (Mat_SeqAIJ *)A->data; 3078 b = (Mat_SeqAIJ *)B->data; 3079 /* product data */ 3080 PetscCall(PetscNew(&mmdata)); 3081 C->product->data = mmdata; 3082 C->product->destroy = MatDestroy_MatMatCusparse; 3083 3084 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3085 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3086 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3087 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3088 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3089 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3090 3091 ptype = product->type; 3092 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3093 ptype = MATPRODUCT_AB; 3094 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3095 } 3096 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3097 ptype = MATPRODUCT_AB; 3098 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3099 } 3100 biscompressed = PETSC_FALSE; 3101 ciscompressed = PETSC_FALSE; 3102 switch (ptype) { 3103 case MATPRODUCT_AB: 3104 m 
= A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    /* use the explicit transpose of A as the left operand (spgemm cannot transpose) */
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    /* use the explicit transpose of B as the right operand */
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero-row set: copy the row index map to both host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Cmat->cprowIndices = NULL;
  }
  /* device CSR of C has only the (possibly compressed) nonzero rows */
  Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows = Ccusp->nrows;
  Ccsr->num_cols = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants, needed because the cuSPARSE pointer mode is set to DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns!
 */
    /* degenerate product: build an all-empty C and skip the spgemm machinery entirely */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; values and
       column indices are shared with B's compressed storage, only the offsets are new */
    CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows = B->rmap->n;
    Bcsr->num_cols = cBcsr->num_cols;
    Bcsr->num_entries = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count (host-side, from the CPU copies of the index arrays);
     logged by the numeric phase on every product */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /* same two-phase (query size, then run) protocol for the nnz step */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    /* dBuffer1/dBuffer2 are only needed up to the nnz step; dBuffer3 must survive until after _copy */
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    /* first numeric computation here; later products reuse the pattern via SpGEMMreuse_compute in the numeric phase */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API...
  */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                     mmdata->mmBufferSize / 1024));
  /* allocate C's storage now that nnz is known, then hand the pointers to the descriptor */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* pre-CUDA-11 legacy path: csrgemmNnz computes row offsets and nnz with host pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* Mirror the device CSR pattern into the host SeqAIJ arrays (c->i/c->j) so C behaves
     like any assembled SeqAIJ matrix on the CPU side */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows absent from the compressed set are empty: repeat the previous offset for them */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  /* per-row counts and maximum row length, as MatAssemblyEnd_SeqAIJ would compute them */
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated = PETSC_TRUE;
  C->assembled =
PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/* Selects the symbolic routine for a product whose A is SEQAIJCUSPARSE, letting the user
   force the CPU backend per product type via command line options; falls back to the plain
   SeqAIJ dispatch when any operand is bound to the CPU or is not CUSPARSE. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user called the old-style API (MatMatMult etc.)
       or the MatProduct API; PetscOptionsBegin/End must stay paired inside each branch */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A xx: plain multiply, no transpose, no add */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx,
Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H xx: Hermitian transpose multiply (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T xx: transpose multiply (trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y[idx[i]] += x[i] for i in [0, n): scatter-add a compacted work vector into the full
   result vector; used when the matrix is stored in compressed-row form. One thread per
   entry; idx entries are assumed distinct (each compressed row maps to one full row),
   so no atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Compute the global index in PetscInt, with the cast applied before the multiply:
     the previous `int i = blockIdx.x * blockDim.x + threadIdx.x` did the arithmetic in
     32 bits and could overflow in 64-bit-index builds when n approaches/exceeds 2^31 */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3577 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3578 { 3579 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3580 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3581 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3582 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3583 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3584 PetscBool compressed; 3585 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3586 PetscInt nx, ny; 3587 #endif 3588 3589 PetscFunctionBegin; 3590 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3591 if (!a->nz) { 3592 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3593 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3594 PetscFunctionReturn(PETSC_SUCCESS); 3595 } 3596 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3597 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3598 if (!trans) { 3599 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3600 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3601 } else { 3602 if (herm || !A->form_explicit_transpose) { 3603 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3604 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3605 } else { 3606 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3607 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3608 } 3609 } 3610 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3611 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3612 3613 try { 3614 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3615 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3616 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3617 3618 PetscCall(PetscLogGpuTimeBegin()); 3619 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3620 /* z = A x + beta y. 3621 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3622 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3623 */ 3624 xptr = xarray; 3625 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3626 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3627 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3628 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3629 allocated to accommodate different uses. So we get the length info directly from mat. 3630 */ 3631 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3632 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3633 nx = mat->num_cols; // since y = Ax 3634 ny = mat->num_rows; 3635 } 3636 #endif 3637 } else { 3638 /* z = A^T x + beta y 3639 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3640 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3641 */ 3642 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3643 dptr = zarray; 3644 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3645 if (compressed) { /* Scatter x to work vector */ 3646 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3647 3648 thrust::for_each( 3649 #if PetscDefined(HAVE_THRUST_ASYNC) 3650 thrust::cuda::par.on(PetscDefaultCudaStream), 3651 #endif 3652 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3653 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3654 } 3655 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3656 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3657 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3658 nx = mat->num_rows; // since y = A^T x 3659 ny = mat->num_cols; 3660 } 3661 #endif 3662 } 3663 3664 /* csr_spmv does y = alpha op(A) x + beta y */ 3665 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3666 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3667 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3668 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 
3669 #else 3670 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3671 #endif 3672 3673 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3674 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3675 if (!matDescr) { 3676 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3677 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3678 } 3679 #endif 3680 3681 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3682 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3683 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3684 PetscCallCUSPARSE( 3685 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3686 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3687 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3688 PetscCallCUSPARSE( 3689 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3690 #endif 3691 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3692 } else { 3693 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3694 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3695 
PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3696 } 3697 3698 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3699 #else 3700 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3701 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3702 #endif 3703 } else { 3704 if (cusparsestruct->nrows) { 3705 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3706 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3707 #else 3708 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3709 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3710 #endif 3711 } 3712 } 3713 PetscCall(PetscLogGpuTimeEnd()); 3714 3715 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3716 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3717 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3718 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3719 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3720 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3721 } 3722 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3723 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3724 } 3725 3726 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3727 if (compressed) { 3728 PetscCall(PetscLogGpuTimeBegin()); 3729 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3730 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3731 prevent that. So I just add a ScatterAdd kernel. 3732 */ 3733 #if 0 3734 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3735 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3736 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3737 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3738 VecCUDAPlusEquals()); 3739 #else 3740 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3741 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3742 #endif 3743 PetscCall(PetscLogGpuTimeEnd()); 3744 } 3745 } else { 3746 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3747 } 3748 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3749 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3750 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3751 } catch (char *ex) { 3752 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3753 } 3754 if (yy) { 3755 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3756 } else { 3757 PetscCall(PetscLogGpuFlops(2.0 * a->nz - 
a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatMultTransposeAdd for MATSEQAIJCUSPARSE: zz = A^T xx + yy.
   Forwards to the shared mult-add kernel; the two flags select transpose (PETSC_TRUE)
   and non-Hermitian (PETSC_FALSE) — see MatMultAddKernel_SeqAIJCUSPARSE. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* AssemblyEnd: completes assembly on the host via the SeqAIJ implementation; no GPU
   work happens here (device data is refreshed elsewhere, e.g. MatSeqAIJCUSPARSECopyToGPU) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy: releases either the GPU mult structures (non-factored matrices) or the
   triangular-factor structures, removes every composed method, then falls through
   to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* Unregister all composed methods so no dangling pointers survive destruction */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate: duplicate on the host as SeqAIJ, then convert the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* AXPY: Y = a*X + Y on the GPU.
   Fast paths: cublasXaxpy on the value arrays when the nonzero patterns are identical,
   cusparse spgeam when X's pattern is a subset of Y's; otherwise falls back to the
   host SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bindings (one matrix bound to CPU): do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: compare row offsets and column indices on device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cusparse spgeam, writing the result back into Y's CSR arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b live on the host here, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays are conformal, so use a plain cublas axpy */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Scale: Y = a*Y, applied directly to the device value array with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* ZeroEntries: zeros both the device CSR values (and transpose values, if present)
   and the host value array, then sets the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device copy was also zeroed */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* BindToCPU: switches the matrix operation tables (and composed methods) between
   the host SeqAIJ implementations (flg = PETSC_TRUE) and the CUSPARSE ones.
   For factored matrices only the flag is recorded. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* pull current values back to the host before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only make sense for the CPU kernels */
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a SeqAIJ matrix (in place or into a duplicate) to SEQAIJCUSPARSE:
   allocates the GPU-side bookkeeping structure, picks default cusparse algorithms,
   installs the CUSPARSE operation table and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: create a SeqAIJ matrix and convert it in place to CUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format (with CUDA 11.0 and later only the CSR format is supported).
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                     Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Registers the cusparse solver type for LU, Cholesky, ILU and ICC factorizations
   of MATSEQAIJCUSPARSE matrices. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Frees all GPU-side data (mult structures, work vectors, csr2csc maps, cusparse handle)
   attached to a non-factored matrix. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if
(cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Frees a CsrMatrix (device value/index vectors plus the struct itself) and NULLs the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Frees one triangular-factor structure (legacy csrsv path, pre CUDA 11.4 only) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Frees a mult structure: the stored matrix (CSR or legacy HYB), descriptors,
   compressed-row indices, device scalar constants, and all cached SpMV/SpMM descriptors */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached descriptor set per cusparseOperation_t (N, T, H) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Releases every device/host resource held by a triangular-factor bundle, but keeps
   the struct (and its cusparse handle) alive so it can be refilled by a new factorization */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Full teardown: Reset plus destruction of the cusparse handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic less-than on (i, j) index pairs; usable from host and device code */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Marks the cached transpose as stale; with destroy = PETSC_TRUE also frees it
   (and the csr2csc permutation used to rebuild it) */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Container destructor for the device-side COO struct: frees its device arrays then the struct */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* COO preallocation: does the host-side preallocation, then mirrors the resulting
   COO bookkeeping (jmap, perm) onto the device for MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4350 PetscBool dev_ij = PETSC_FALSE; 4351 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4352 PetscInt *i, *j; 4353 PetscContainer container_h; 4354 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4355 4356 PetscFunctionBegin; 4357 PetscCall(PetscGetMemType(coo_i, &mtype)); 4358 if (PetscMemTypeDevice(mtype)) { 4359 dev_ij = PETSC_TRUE; 4360 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4361 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4362 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4363 } else { 4364 i = coo_i; 4365 j = coo_j; 4366 } 4367 4368 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4369 if (dev_ij) PetscCall(PetscFree2(i, j)); 4370 mat->offloadmask = PETSC_OFFLOAD_CPU; 4371 // Create the GPU memory 4372 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4373 4374 // Copy the COO struct to device 4375 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4376 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4377 PetscCall(PetscMalloc1(1, &coo_d)); 4378 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4379 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4380 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4381 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4382 PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4383 4384 // Put the COO struct in a container and then attach that to the matrix 4385 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 4386 PetscFunctionReturn(PETSC_SUCCESS); 4387 } 4388 4389 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const 
PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  // One matrix nonzero per logical thread, grid-stride so any grid size covers all nnz.
  // Use 64-bit (PetscCount) arithmetic: blockIdx.x * blockDim.x is otherwise evaluated in
  // 32-bit unsigned and silently wraps for launches addressing more than 2^32 entries.
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    // jmap[i]..jmap[i+1] delimits the COO input entries that land on nonzero i; perm[] gathers them from kv[]
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

// Fill the matrix values from a COO value array v[] (host or device pointer), folding
// duplicate (i,j) entries together, with INSERT_VALUES or ADD_VALUES semantics
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  // Retrieve the device-side COO mapping attached by MatSetPreallocationCOO_SeqAIJCUSPARSE()
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); // values are fully overwritten, no need to sync them to device first
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // Clamp the grid dimension to the CUDA limit (2^31-1 blocks): the kernel is grid-stride,
    // so capping is safe even when (Annz + 255) / 256 exceeds it, whereas the previous direct
    // (int) cast could overflow to a negative/invalid launch configuration for huge Annz.
    const PetscCount nblocks = (Annz + 255) / 256;
    MatAddCOOValues<<<(int)PetscMin(nblocks, (PetscCount)2147483647), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // free the temporary device copy of host-provided values
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); // both pointers are required; a partial request is a no-op
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      // The device CSR stores only nonempty rows; build (and cache) the full A->rmap->n + 1
      // row-offset array on the GPU from the host CSR row pointers
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i - the CSR row pointers
- j - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Only invalidates the caller's pointers; no device state is modified
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
.
a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // Make sure the device copy is current before handing out a pointer into it
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Read-only access: no state/diagonal invalidation needed, just drop the pointer
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  // Caller may modify device values: device copy becomes authoritative and any cached transpose is stale
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Values may have changed: cached diagonal is stale and the object state must advance
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // Note: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is made — the
  // caller promises to overwrite the values, so the current host data is irrelevant
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Values were (re)written: cached diagonal is stale and the object state must advance
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Functor: order (row, col, value, flag) 4-tuples lexicographically by (row, col);
   value and flag components do not participate in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Functor: add a fixed offset to an (index) value; used to shift column/row indices */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* C has A's rows; its columns are A's columns followed by B's columns */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    /* build C's device CSR structure by hand (no compressed-row storage) */
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    c->compressedrow.use = PETSC_FALSE;
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows = m;
    Ccusp->mat = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows = m;
    Ccsr->num_cols = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV paths */
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix *)Acusp->mat->mat;
    Bcsr = (CsrMatrix *)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* coords records, for each entry of C, which source entry (A first, then B) it came from;
       it is reused in the MAT_REUSE_MATRIX branch to scatter fresh values into C */
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* convert both CSR structures to COO row indices, then merge the (row, col, val, flag)
         streams in (row, col) order to obtain C's entries */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* flag component: 1 marks entries originating from A, 0 from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* B's columns are shifted by A->cmap->n on the fly while merging */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1 = Ccusp->coords->begin();
      auto p2 = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied above so B is left unmodified */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* split the merged positions back into "came from A" (first Annz of coords) and
         "came from B" (the rest) using the 0/1 flags recorded in wPerm */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        /* C^T is n x m: A^T stacked on top of B^T */
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one: A^T's closing offset coincides with B^T's opening offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          /* B^T's offsets continue after A's nnz entries */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the (possibly 32-bit) device CSR structure into C's host arrays */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure of C is unchanged, only refresh the values */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0..Annz) locates A's entries in C, coords[Annz..) locates B's */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit,
ziebit, VecCUDAEquals()); 4972 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4973 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4974 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4975 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4976 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4977 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4978 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4979 auto vT = CcsrT->values->begin(); 4980 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4981 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4982 (*C)->transupdated = PETSC_TRUE; 4983 } 4984 PetscCall(PetscLogGpuTimeEnd()); 4985 } 4986 } 4987 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4988 (*C)->assembled = PETSC_TRUE; 4989 (*C)->was_assembled = PETSC_FALSE; 4990 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4991 PetscFunctionReturn(PETSC_SUCCESS); 4992 } 4993 4994 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4995 { 4996 bool dmem; 4997 const PetscScalar *av; 4998 4999 PetscFunctionBegin; 5000 dmem = isCudaMem(v); 5001 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 5002 if (n && idx) { 5003 THRUSTINTARRAY widx(n); 5004 widx.assign(idx, idx + n); 5005 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 5006 5007 THRUSTARRAY *w = NULL; 5008 thrust::device_ptr<PetscScalar> dv; 5009 if (dmem) { 5010 dv = thrust::device_pointer_cast(v); 5011 } else { 5012 w = new THRUSTARRAY(n); 5013 dv = w->data(); 5014 } 5015 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5016 5017 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, 
widx.begin()), dv)); 5018 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 5019 thrust::for_each(zibit, zieit, VecCUDAEquals()); 5020 if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 5021 delete w; 5022 } else { 5023 PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5024 } 5025 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 5026 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 5027 PetscFunctionReturn(PETSC_SUCCESS); 5028 } 5029 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 5030