1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #endif 19 #include <thrust/iterator/constant_iterator.h> 20 #include <thrust/remove.h> 21 #include <thrust/sort.h> 22 #include <thrust/unique.h> 23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST) 24 #include <cuda/std/functional> 25 #endif 26 27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 30 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
31 32 typedef enum { 33 CUSPARSE_MV_ALG_DEFAULT = 0, 34 CUSPARSE_COOMV_ALG = 1, 35 CUSPARSE_CSRMV_ALG1 = 2, 36 CUSPARSE_CSRMV_ALG2 = 3 37 } cusparseSpMVAlg_t; 38 39 typedef enum { 40 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 41 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 42 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 43 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 44 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 45 CUSPARSE_SPMM_ALG_DEFAULT = 0, 46 CUSPARSE_SPMM_COO_ALG1 = 1, 47 CUSPARSE_SPMM_COO_ALG2 = 2, 48 CUSPARSE_SPMM_COO_ALG3 = 3, 49 CUSPARSE_SPMM_COO_ALG4 = 5, 50 CUSPARSE_SPMM_CSR_ALG1 = 4, 51 CUSPARSE_SPMM_CSR_ALG2 = 6, 52 } cusparseSpMMAlg_t; 53 54 typedef enum { 55 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 56 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 57 } cusparseCsr2CscAlg_t; 58 */ 59 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 60 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 61 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 62 #endif 63 64 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 66 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 67 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 68 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 73 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 74 #endif 75 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 76 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 77 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 78 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 79 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 80 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 81 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 82 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 83 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 84 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 85 86 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 87 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 88 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 89 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 90 91 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 92 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 93 94 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 95 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 96 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 97 98 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 99 { 100 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 101 102 PetscFunctionBegin; 103 switch (op) { 104 case MAT_CUSPARSE_MULT: 105 cusparsestruct->format = format; 106 break; 107 case MAT_CUSPARSE_ALL: 108 cusparsestruct->format = format; 109 break; 110 default: 111 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 112 } 113 PetscFunctionReturn(PETSC_SUCCESS); 114 } 115 116 /*@ 117 MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 118 operation. Only the `MatMult()` operation can use different GPU storage formats 119 120 Not Collective 121 122 Input Parameters: 123 + A - Matrix of type `MATSEQAIJCUSPARSE` 124 . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 125 `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 126 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 
127 128 Level: intermediate 129 130 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 131 @*/ 132 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 133 { 134 PetscFunctionBegin; 135 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 136 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 141 { 142 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 143 144 PetscFunctionBegin; 145 cusparsestruct->use_cpu_solve = use_cpu; 146 PetscFunctionReturn(PETSC_SUCCESS); 147 } 148 149 /*@ 150 MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 151 152 Input Parameters: 153 + A - Matrix of type `MATSEQAIJCUSPARSE` 154 - use_cpu - set flag for using the built-in CPU `MatSolve()` 155 156 Level: intermediate 157 158 Note: 159 The cuSparse LU solver currently computes the factors with the built-in CPU method 160 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 161 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
162 163 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164 @*/ 165 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 166 { 167 PetscFunctionBegin; 168 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 169 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 170 PetscFunctionReturn(PETSC_SUCCESS); 171 } 172 173 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 174 { 175 PetscFunctionBegin; 176 switch (op) { 177 case MAT_FORM_EXPLICIT_TRANSPOSE: 178 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 179 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 180 A->form_explicit_transpose = flg; 181 break; 182 default: 183 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 184 break; 185 } 186 PetscFunctionReturn(PETSC_SUCCESS); 187 } 188 189 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject) 190 { 191 MatCUSPARSEStorageFormat format; 192 PetscBool flg; 193 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 194 195 PetscFunctionBegin; 196 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 197 if (A->factortype == MAT_FACTOR_NONE) { 198 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 199 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 200 201 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 202 if (flg) 
PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 203 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 204 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 205 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 206 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 207 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 208 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 209 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 210 #else 211 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 212 #endif 213 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 214 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 215 216 PetscCall( 217 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 218 PetscCheck(!flg || 
CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 219 #endif 220 } 221 PetscOptionsHeadEnd(); 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 226 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 227 { 228 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 229 PetscInt m = A->rmap->n; 230 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 231 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 232 const MatScalar *Aa = a->a; 233 PetscInt *Mi, *Mj, Mnz; 234 PetscScalar *Ma; 235 236 PetscFunctionBegin; 237 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 238 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 239 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 240 Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 241 PetscCall(PetscMalloc1(m + 1, &Mi)); 242 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 243 PetscCall(PetscMalloc1(Mnz, &Ma)); 244 Mi[0] = 0; 245 for (PetscInt i = 0; i < m; i++) { 246 PetscInt llen = Ai[i + 1] - Ai[i]; 247 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 248 PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 249 Mj[Mi[i] + llen] = i; // diagonal entry 250 PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 251 Mi[i + 1] = Mi[i] + llen + ulen; 252 } 253 // Copy M (L,U) from host to device 254 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 255 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 256 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 257 
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice)); 258 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice)); 259 260 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 261 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 262 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 263 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 264 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 265 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER; 266 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; 267 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 268 269 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 270 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 271 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 272 273 fillMode = CUSPARSE_FILL_MODE_UPPER; 274 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 275 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 276 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 277 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 278 279 // 
Allocate work vectors in SpSv 280 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 281 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 282 283 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 284 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 285 286 // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE 287 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 288 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 289 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 290 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 291 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 292 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 293 294 // Record for reuse 295 fs->csrRowPtr_h = Mi; 296 fs->csrVal_h = Ma; 297 PetscCall(PetscFree(Mj)); 298 } 299 // Copy the value 300 Mi = fs->csrRowPtr_h; 301 Ma = fs->csrVal_h; 302 Mnz = Mi[m]; 303 for (PetscInt i = 0; i < m; i++) { 304 PetscInt llen = Ai[i + 1] - Ai[i]; 305 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 306 PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L 307 Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry 308 PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 309 } 310 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, 
cudaMemcpyHostToDevice)); 311 312 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 313 if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed? 314 // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer" 315 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 316 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 317 } else 318 #endif 319 { 320 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 321 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 322 323 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 324 fs->updatedSpSVAnalysis = PETSC_TRUE; 325 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 326 } 327 } 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 #else 331 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 332 { 333 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 334 PetscInt n = A->rmap->n; 335 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 336 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 337 const PetscInt *ai = a->i, *aj = a->j, *vi; 338 const MatScalar *aa = a->a, *v; 339 PetscInt *AiLo, *AjLo; 340 PetscInt i, nz, nzLower, offset, rowOffset; 341 342 PetscFunctionBegin; 343 if (!n) 
PetscFunctionReturn(PETSC_SUCCESS); 344 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 345 try { 346 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 347 nzLower = n + ai[n] - ai[1]; 348 if (!loTriFactor) { 349 PetscScalar *AALo; 350 351 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 352 353 /* Allocate Space for the lower triangular matrix */ 354 PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 355 PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 356 357 /* Fill the lower triangular matrix */ 358 AiLo[0] = (PetscInt)0; 359 AiLo[n] = nzLower; 360 AjLo[0] = (PetscInt)0; 361 AALo[0] = (MatScalar)1.0; 362 v = aa; 363 vi = aj; 364 offset = 1; 365 rowOffset = 1; 366 for (i = 1; i < n; i++) { 367 nz = ai[i + 1] - ai[i]; 368 /* additional 1 for the term on the diagonal */ 369 AiLo[i] = rowOffset; 370 rowOffset += nz + 1; 371 372 PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 373 PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 374 375 offset += nz; 376 AjLo[offset] = (PetscInt)i; 377 AALo[offset] = (MatScalar)1.0; 378 offset += 1; 379 380 v += nz; 381 vi += nz; 382 } 383 384 /* allocate space for the triangular factor information */ 385 PetscCall(PetscNew(&loTriFactor)); 386 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 387 /* Create the matrix description */ 388 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 389 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 390 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 391 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 392 #else 393 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 394 #endif 395 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 396 
PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 397 398 /* set the operation */ 399 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 400 401 /* set the matrix */ 402 loTriFactor->csrMat = new CsrMatrix; 403 loTriFactor->csrMat->num_rows = n; 404 loTriFactor->csrMat->num_cols = n; 405 loTriFactor->csrMat->num_entries = nzLower; 406 407 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 408 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 409 410 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 411 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 412 413 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 414 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 415 416 /* Create the solve analysis information */ 417 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 418 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 419 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 420 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 421 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 422 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 423 #endif 424 425 /* perform the solve analysis */ 426 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 427 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 428 PetscCallCUDA(WaitForCUDA()); 429 
PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 430 431 /* assign the pointer */ 432 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 433 loTriFactor->AA_h = AALo; 434 PetscCallCUDA(cudaFreeHost(AiLo)); 435 PetscCallCUDA(cudaFreeHost(AjLo)); 436 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 437 } else { /* update values only */ 438 if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 439 /* Fill the lower triangular matrix */ 440 loTriFactor->AA_h[0] = 1.0; 441 v = aa; 442 vi = aj; 443 offset = 1; 444 for (i = 1; i < n; i++) { 445 nz = ai[i + 1] - ai[i]; 446 PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 447 offset += nz; 448 loTriFactor->AA_h[offset] = 1.0; 449 offset += 1; 450 v += nz; 451 } 452 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 453 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 454 } 455 } catch (char *ex) { 456 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 457 } 458 } 459 PetscFunctionReturn(PETSC_SUCCESS); 460 } 461 462 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 463 { 464 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 465 PetscInt n = A->rmap->n; 466 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 467 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 468 const PetscInt *aj = a->j, *adiag = a->diag, *vi; 469 const MatScalar *aa = a->a, *v; 470 PetscInt *AiUp, *AjUp; 471 PetscInt i, nz, nzUpper, offset; 472 473 PetscFunctionBegin; 474 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 475 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 476 try { 477 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 478 nzUpper = adiag[0] - adiag[n]; 479 if (!upTriFactor) { 480 PetscScalar *AAUp; 481 482 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 483 484 /* Allocate Space for the upper triangular matrix */ 485 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 486 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 487 488 /* Fill the upper triangular matrix */ 489 AiUp[0] = (PetscInt)0; 490 AiUp[n] = nzUpper; 491 offset = nzUpper; 492 for (i = n - 1; i >= 0; i--) { 493 v = aa + adiag[i + 1] + 1; 494 vi = aj + adiag[i + 1] + 1; 495 496 /* number of elements NOT on the diagonal */ 497 nz = adiag[i] - adiag[i + 1] - 1; 498 499 /* decrement the offset */ 500 offset -= (nz + 1); 501 502 /* first, set the diagonal elements */ 503 AjUp[offset] = (PetscInt)i; 504 AAUp[offset] = (MatScalar)1. / v[nz]; 505 AiUp[i] = AiUp[i + 1] - (nz + 1); 506 507 PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz)); 508 PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz)); 509 } 510 511 /* allocate space for the triangular factor information */ 512 PetscCall(PetscNew(&upTriFactor)); 513 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 514 515 /* Create the matrix description */ 516 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 517 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 518 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 519 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 520 #else 521 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 522 #endif 523 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 524 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 525 526 /* set the operation */ 527 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 528 529 /* set the matrix */ 530 
upTriFactor->csrMat = new CsrMatrix; 531 upTriFactor->csrMat->num_rows = n; 532 upTriFactor->csrMat->num_cols = n; 533 upTriFactor->csrMat->num_entries = nzUpper; 534 535 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 536 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 537 538 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 539 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 540 541 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 542 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 543 544 /* Create the solve analysis information */ 545 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 546 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 547 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 548 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 549 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 550 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 551 #endif 552 553 /* perform the solve analysis */ 554 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 555 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 556 557 PetscCallCUDA(WaitForCUDA()); 558 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 559 560 /* assign the pointer */ 561 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 562 upTriFactor->AA_h = AAUp; 
563 PetscCallCUDA(cudaFreeHost(AiUp)); 564 PetscCallCUDA(cudaFreeHost(AjUp)); 565 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 566 } else { 567 if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 568 /* Fill the upper triangular matrix */ 569 offset = nzUpper; 570 for (i = n - 1; i >= 0; i--) { 571 v = aa + adiag[i + 1] + 1; 572 573 /* number of elements NOT on the diagonal */ 574 nz = adiag[i] - adiag[i + 1] - 1; 575 576 /* decrement the offset */ 577 offset -= (nz + 1); 578 579 /* first, set the diagonal elements */ 580 upTriFactor->AA_h[offset] = 1. / v[nz]; 581 PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz)); 582 } 583 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 584 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 585 } 586 } catch (char *ex) { 587 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 588 } 589 } 590 PetscFunctionReturn(PETSC_SUCCESS); 591 } 592 #endif 593 594 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 595 { 596 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 597 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 598 IS isrow = a->row, isicol = a->icol; 599 PetscBool row_identity, col_identity; 600 PetscInt n = A->rmap->n; 601 602 PetscFunctionBegin; 603 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 604 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 605 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A)); 606 #else 607 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 608 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 609 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 610 #endif 611 612 cusparseTriFactors->nnz = a->nz; 613 614 A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU 
615 /* lower triangular indices */ 616 PetscCall(ISIdentity(isrow, &row_identity)); 617 if (!row_identity && !cusparseTriFactors->rpermIndices) { 618 const PetscInt *r; 619 620 PetscCall(ISGetIndices(isrow, &r)); 621 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 622 cusparseTriFactors->rpermIndices->assign(r, r + n); 623 PetscCall(ISRestoreIndices(isrow, &r)); 624 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 625 } 626 627 /* upper triangular indices */ 628 PetscCall(ISIdentity(isicol, &col_identity)); 629 if (!col_identity && !cusparseTriFactors->cpermIndices) { 630 const PetscInt *c; 631 632 PetscCall(ISGetIndices(isicol, &c)); 633 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 634 cusparseTriFactors->cpermIndices->assign(c, c + n); 635 PetscCall(ISRestoreIndices(isicol, &c)); 636 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 637 } 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 642 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A) 643 { 644 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 645 PetscInt m = A->rmap->n; 646 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 647 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 648 const MatScalar *Aa = a->a; 649 PetscInt *Mj, Mnz; 650 PetscScalar *Ma, *D; 651 652 PetscFunctionBegin; 653 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 654 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0 655 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host. 656 // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host. 
657 Mnz = Ai[m]; // Unz (with the unit diagonal) 658 PetscCall(PetscMalloc1(Mnz, &Ma)); 659 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 660 PetscCall(PetscMalloc1(m, &D)); // the diagonal 661 for (PetscInt i = 0; i < m; i++) { 662 PetscInt ulen = Ai[i + 1] - Ai[i]; 663 Mj[Ai[i]] = i; // diagonal entry 664 PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 665 } 666 // Copy M (U) from host to device 667 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 668 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 669 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 670 PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m)); 671 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 672 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 673 674 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 675 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 676 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 677 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 678 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 679 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 680 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 681 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 682 683 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 684 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 685 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 686 687 // Allocate work vectors in SpSv 688 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 689 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 690 691 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 692 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 693 694 // Query buffer sizes for SpSV and then allocate buffers 695 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 696 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 697 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 698 699 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 700 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 701 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 702 703 // Record for reuse 704 fs->csrVal_h = Ma; 705 fs->diag_h = D; 706 PetscCall(PetscFree(Mj)); 707 } 708 // Copy the value 709 Ma = fs->csrVal_h; 710 D = fs->diag_h; 711 Mnz = Ai[m]; 712 for 
(PetscInt i = 0; i < m; i++) { 713 D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 714 Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 715 for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 716 } 717 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 718 PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 719 720 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 721 if (fs->updatedSpSVAnalysis) { 722 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 723 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 724 } else 725 #endif 726 { 727 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 728 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 729 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 730 fs->updatedSpSVAnalysis = PETSC_TRUE; 731 } 732 } 733 PetscFunctionReturn(PETSC_SUCCESS); 734 } 735 736 // Solve Ut D U x = b 737 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 738 { 739 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 740 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 741 const PetscScalar *barray; 742 PetscScalar *xarray; 743 thrust::device_ptr<const PetscScalar> bGPU; 744 
thrust::device_ptr<PetscScalar> xGPU; 745 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 746 PetscInt m = A->rmap->n; 747 748 PetscFunctionBegin; 749 PetscCall(PetscLogGpuTimeBegin()); 750 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 751 PetscCall(VecCUDAGetArrayRead(b, &barray)); 752 xGPU = thrust::device_pointer_cast(xarray); 753 bGPU = thrust::device_pointer_cast(barray); 754 755 // Reorder b with the row permutation if needed, and wrap the result in fs->X 756 if (fs->rpermIndices) { 757 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 758 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 759 } else { 760 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 761 } 762 763 // Solve Ut Y = X 764 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 765 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 766 767 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 768 // It is basically a vector element-wise multiplication, but cublas does not have it! 
769 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 770 771 // Solve U X = Y 772 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 773 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 774 } else { 775 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 776 } 777 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 778 779 // Reorder X with the column permutation if needed, and put the result back to x 780 if (fs->cpermIndices) { 781 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 782 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 783 } 784 785 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 786 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 787 PetscCall(PetscLogGpuTimeEnd()); 788 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 #else 792 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 793 { 794 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 795 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 796 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 797 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 798 PetscInt *AiUp, *AjUp; 799 PetscScalar *AAUp; 800 PetscScalar *AALo; 801 PetscInt nzUpper = 
a->nz, n = A->rmap->n, i, offset, nz, j; 802 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 803 const PetscInt *ai = b->i, *aj = b->j, *vj; 804 const MatScalar *aa = b->a, *v; 805 806 PetscFunctionBegin; 807 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 808 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 809 try { 810 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 811 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 812 if (!upTriFactor && !loTriFactor) { 813 /* Allocate Space for the upper triangular matrix */ 814 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 815 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 816 817 /* Fill the upper triangular matrix */ 818 AiUp[0] = (PetscInt)0; 819 AiUp[n] = nzUpper; 820 offset = 0; 821 for (i = 0; i < n; i++) { 822 /* set the pointers */ 823 v = aa + ai[i]; 824 vj = aj + ai[i]; 825 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 826 827 /* first, set the diagonal elements */ 828 AjUp[offset] = (PetscInt)i; 829 AAUp[offset] = (MatScalar)1.0 / v[nz]; 830 AiUp[i] = offset; 831 AALo[offset] = (MatScalar)1.0 / v[nz]; 832 833 offset += 1; 834 if (nz > 0) { 835 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 836 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 837 for (j = offset; j < offset + nz; j++) { 838 AAUp[j] = -AAUp[j]; 839 AALo[j] = AAUp[j] / v[nz]; 840 } 841 offset += nz; 842 } 843 } 844 845 /* allocate space for the triangular factor information */ 846 PetscCall(PetscNew(&upTriFactor)); 847 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 848 849 /* Create the matrix description */ 850 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 851 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 852 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 853 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL)); 854 #else 855 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 856 #endif 857 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 858 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 859 860 /* set the matrix */ 861 upTriFactor->csrMat = new CsrMatrix; 862 upTriFactor->csrMat->num_rows = A->rmap->n; 863 upTriFactor->csrMat->num_cols = A->cmap->n; 864 upTriFactor->csrMat->num_entries = a->nz; 865 866 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 867 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 868 869 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 870 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 871 872 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 873 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 874 875 /* set the operation */ 876 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 877 878 /* Create the solve analysis information */ 879 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 880 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 881 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 882 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 883 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 884 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 885 #endif 886 887 /* perform the solve analysis */ 888 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, 
upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 889 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 890 891 PetscCallCUDA(WaitForCUDA()); 892 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 893 894 /* assign the pointer */ 895 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 896 897 /* allocate space for the triangular factor information */ 898 PetscCall(PetscNew(&loTriFactor)); 899 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 900 901 /* Create the matrix description */ 902 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 903 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 904 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 905 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 906 #else 907 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 908 #endif 909 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 910 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 911 912 /* set the operation */ 913 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 914 915 /* set the matrix */ 916 loTriFactor->csrMat = new CsrMatrix; 917 loTriFactor->csrMat->num_rows = A->rmap->n; 918 loTriFactor->csrMat->num_cols = A->cmap->n; 919 loTriFactor->csrMat->num_entries = a->nz; 920 921 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 922 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 923 924 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 925 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 926 927 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 928 loTriFactor->csrMat->values->assign(AALo, AALo + 
a->nz); 929 930 /* Create the solve analysis information */ 931 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 932 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 933 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 934 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 935 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 936 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 937 #endif 938 939 /* perform the solve analysis */ 940 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 941 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 942 943 PetscCallCUDA(WaitForCUDA()); 944 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 945 946 /* assign the pointer */ 947 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 948 949 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 950 PetscCallCUDA(cudaFreeHost(AiUp)); 951 PetscCallCUDA(cudaFreeHost(AjUp)); 952 } else { 953 /* Fill the upper triangular matrix */ 954 offset = 0; 955 for (i = 0; i < n; i++) { 956 /* set the pointers */ 957 v = aa + ai[i]; 958 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 959 960 /* first, set the diagonal elements */ 961 AAUp[offset] = 1.0 / v[nz]; 962 AALo[offset] = 1.0 / v[nz]; 963 964 offset += 1; 965 if (nz > 0) { 966 PetscCall(PetscArraycpy(&AAUp[offset], v, 
nz)); 967 for (j = offset; j < offset + nz; j++) { 968 AAUp[j] = -AAUp[j]; 969 AALo[j] = AAUp[j] / v[nz]; 970 } 971 offset += nz; 972 } 973 } 974 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 975 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 976 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 977 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 978 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 979 } 980 PetscCallCUDA(cudaFreeHost(AAUp)); 981 PetscCallCUDA(cudaFreeHost(AALo)); 982 } catch (char *ex) { 983 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 984 } 985 } 986 PetscFunctionReturn(PETSC_SUCCESS); 987 } 988 #endif 989 990 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 991 { 992 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 993 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 994 IS ip = a->row; 995 PetscBool perm_identity; 996 PetscInt n = A->rmap->n; 997 998 PetscFunctionBegin; 999 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 1000 1001 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1002 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 1003 #else 1004 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 1005 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 1006 #endif 1007 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 1008 1009 A->offloadmask = PETSC_OFFLOAD_BOTH; 1010 1011 /* lower triangular indices */ 1012 PetscCall(ISIdentity(ip, &perm_identity)); 1013 if (!perm_identity) { 1014 IS iip; 1015 const PetscInt *irip, *rip; 1016 1017 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 1018 PetscCall(ISGetIndices(iip, &irip)); 1019 PetscCall(ISGetIndices(ip, &rip)); 1020 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1021 
cusparseTriFactors->rpermIndices->assign(rip, rip + n); 1022 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1023 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1024 PetscCall(ISRestoreIndices(iip, &irip)); 1025 PetscCall(ISDestroy(&iip)); 1026 PetscCall(ISRestoreIndices(ip, &rip)); 1027 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1028 } 1029 PetscFunctionReturn(PETSC_SUCCESS); 1030 } 1031 1032 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1033 { 1034 PetscFunctionBegin; 1035 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1036 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1037 B->offloadmask = PETSC_OFFLOAD_CPU; 1038 1039 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1040 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1041 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1042 #else 1043 /* determine which version of MatSolve needs to be used. */ 1044 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1045 IS ip = b->row; 1046 PetscBool perm_identity; 1047 1048 PetscCall(ISIdentity(ip, &perm_identity)); 1049 if (perm_identity) { 1050 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1051 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1052 } else { 1053 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1054 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1055 } 1056 #endif 1057 B->ops->matsolve = NULL; 1058 B->ops->matsolvetranspose = NULL; 1059 1060 /* get the triangular factors */ 1061 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1062 PetscFunctionReturn(PETSC_SUCCESS); 1063 } 1064 1065 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1066 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1067 { 1068 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1069 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 
1070 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1071 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1072 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1073 cusparseIndexBase_t indexBase; 1074 cusparseMatrixType_t matrixType; 1075 cusparseFillMode_t fillMode; 1076 cusparseDiagType_t diagType; 1077 1078 PetscFunctionBegin; 1079 /* allocate space for the transpose of the lower triangular factor */ 1080 PetscCall(PetscNew(&loTriFactorT)); 1081 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1082 1083 /* set the matrix descriptors of the lower triangular factor */ 1084 matrixType = cusparseGetMatType(loTriFactor->descr); 1085 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1086 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1087 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1088 1089 /* Create the matrix description */ 1090 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1091 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1092 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1093 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1094 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1095 1096 /* set the operation */ 1097 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1098 1099 /* allocate GPU space for the CSC of the lower triangular factor*/ 1100 loTriFactorT->csrMat = new CsrMatrix; 1101 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1102 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1103 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1104 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1105 loTriFactorT->csrMat->column_indices = new 
THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1106 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1107 1108 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1109 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1110 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1111 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1112 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1113 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1114 #endif 1115 1116 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1117 { 1118 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1119 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1120 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1121 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1122 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1123 #else 1124 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1125 #endif 1126 PetscCallCUSPARSE(stat); 1127 } 1128 1129 PetscCallCUDA(WaitForCUDA()); 1130 /* close the MAT_CUSPARSEGenerateTranspose event opened before the csr2csc conversion of the lower factor */ PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1131 1132 /* Create the solve analysis information */ 1133 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1134 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1135 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1136 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1137 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1138 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1139 #endif 1140 1141 /* perform the solve analysis */ 1142 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1143
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1144 1145 PetscCallCUDA(WaitForCUDA()); 1146 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1147 1148 /* assign the pointer */ 1149 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1150 1151 /*********************************************/ 1152 /* Now the Transpose of the Upper Tri Factor */ 1153 /*********************************************/ 1154 1155 /* allocate space for the transpose of the upper triangular factor */ 1156 PetscCall(PetscNew(&upTriFactorT)); 1157 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1158 1159 /* set the matrix descriptors of the upper triangular factor */ 1160 matrixType = cusparseGetMatType(upTriFactor->descr); 1161 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1162 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1163 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1164 1165 /* Create the matrix description */ 1166 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1167 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1168 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1169 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1170 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1171 1172 /* set the operation */ 1173 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1174 1175 /* allocate GPU space for the CSC of the upper triangular factor*/ 1176 upTriFactorT->csrMat = new CsrMatrix; 1177 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1178 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1179 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1180 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1181 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1182 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1183 1184 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1185 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1186 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1187 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1188 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1189 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1190 #endif 1191 1192 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1193 { 1194 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1195 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1196 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1197 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1198 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1199 #else 1200 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1201 #endif 1202 PetscCallCUSPARSE(stat); 1203 } 1204 1205 PetscCallCUDA(WaitForCUDA()); 1206 /* close the MAT_CUSPARSEGenerateTranspose event opened before the csr2csc conversion of the upper factor */ PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1207 1208 /* Create the solve analysis information */ 1209 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1210
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  /* christ, would it have killed you to put this stuff in a function????????? */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Functor converting a PetscScalar to a PetscInt by truncating its real part.
   Used below to turn the transposed "sequence" values into a CSR-to-CSC permutation. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) const { return (PetscInt)PetscRealPart(s); }
};

/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds, or refreshes the values of, the explicit transpose of A on the GPU.

  The result is stored in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose and A->transupdated is set to PETSC_TRUE.
  The routine returns immediately when A->transupdated is already set.

  For the CSR format the structure of the transpose is computed only once: a sequence 0,1,...,nz-1 is pushed
  through csr2csc, and the transposed sequence is cached as the permutation cusparsestruct->csr2csc_i. Every call
  then gathers the values of A through that permutation into the values array of the transpose.

  For the ELL/HYB formats (only available before CUDA-11) the transpose is rebuilt from scratch on every call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR transposes cannot be updated in place, so drop the old one and rebuild it below */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of the transpose: its rows are the columns of A */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns; csr2csc writes (number of columns of A) + 1
         column pointers, which become the row offsets of the transpose, so size the array accordingly. */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transpose */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First time through: transpose the sequence 0,1,...,nz-1 instead of the real values, so that the
         transposed "values" tell us where each entry of the transpose comes from in A */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* cache the permutation: entry k of the transpose is entry csr2csc_i[k] of A */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A into the transpose through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note
// that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - transpose solve using the L and U factors held in A->spptr.

  The same sparse descriptors spMatDescr_L/U as in the non-transpose solve are used, with
  CUSPARSE_OPERATION_TRANSPOSE. The transpose SpSV descriptors and their buffers are created on the
  first call; the transpose analysis is (re)done whenever fs->updatedTransposeSpSVAnalysis is false.
  U^T is solved first (X -> Y), then L^T (Y -> X). b is gathered through rpermIndices before the
  solves and the result is gathered through cpermIndices afterwards, when those index arrays exist.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *seq     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     trans   = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  // One-time setup on the first transpose solve: descriptors, buffer sizes, buffers
  if (!factors->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    /* The matrix is still L. We only do transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt, &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Lt, factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // Redo the transpose analysis when it has been flagged as out of date
  if (!factors->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solDev = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsDev = thrust::device_pointer_cast(rhs);

  // Right-hand side of the first solve: b gathered through the row permutation into X, or b itself
  void *firstRhs = (void *)rhs;
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()), thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    firstRhs = (void *)factors->X;
  }
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, firstRhs));

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_Ut));

  // Solve Lt X = Y; when a column permutation follows, the result must land in the intermediate buffer X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->cpermIndices ? (void *)factors->X : (void *)sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, trans, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_Lt));

  // Gather X through the column permutation into x, if there is one
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), solDev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * seq->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with explicitly transposed triangular factors.

  The transposed factors are built on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  b is gathered through rpermIndices into x, the transposed U and then L factors are applied
  (x -> work vector -> x), and finally x is gathered through cpermIndices via the work vector.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation.
     The thrust calls are wrapped in PetscCallThrust(), like elsewhere in this file, so that a thrust
     exception is turned into a PETSc error instead of escaping this function. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE - solve with the triangular factors loTriFactorPtr/upTriFactorPtr and row/column permutations.

  b is gathered through rpermIndices into the work vector, L and then U are applied
  (work vector -> x -> work vector), and the work vector is gathered through cpermIndices into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation.
     The thrust calls are wrapped in PetscCallThrust(), like elsewhere in this file, so that a thrust
     exception is turned into a PETSc error instead of escaping this function. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()));

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric ILU(0) factorization of A into fact, done in place on the GPU.

  A's values are copied into fs->csrVal and factored with cusparseXcsrilu02() (skipped for m = 0).
  The SpSV analysis for L and U is then either refreshed with cusparseSpSV_updateMatrix() (CUDA >= 12.1.1,
  when an analysis already exists) or done from scratch. In both cases the factor values have changed, so
  the transpose-solve analysis is flagged as out of date. Finally the solve callbacks of fact are installed.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* an analysis already exists: only refresh the values held by the non-transpose SpSV descriptors */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed on either path above (fresh analysis or cusparseSpSV_updateMatrix()), so reset
     the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve. The update path only
     refreshes spsvDescr_L/U; without this reset the transpose descriptors would keep an analysis done on
     the previous values. */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
1801 */ 1802 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1803 1804 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1805 fact->factortype = MAT_FACTOR_ILU; 1806 fact->info.factor_mallocs = 0; 1807 fact->info.fill_ratio_given = info->fill; 1808 fact->info.fill_ratio_needed = 1.0; 1809 1810 aij->row = NULL; 1811 aij->col = NULL; 1812 1813 /* ====================================================================== */ 1814 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1815 /* We'll do in-place factorization on fact */ 1816 /* ====================================================================== */ 1817 const int *Ai, *Aj; 1818 1819 m = fact->rmap->n; 1820 nz = aij->nz; 1821 1822 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1823 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1824 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1825 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1826 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1827 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1828 1829 /* ====================================================================== */ 1830 /* Create descriptors for M, L, U */ 1831 /* ====================================================================== */ 1832 cusparseFillMode_t fillMode; 1833 cusparseDiagType_t diagType; 1834 1835 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1836 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1837 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1838 1839 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1840 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1841 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1842 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1843 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1844 */ 1845 fillMode = CUSPARSE_FILL_MODE_LOWER; 1846 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1847 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1848 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1849 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1850 1851 fillMode = CUSPARSE_FILL_MODE_UPPER; 1852 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1853 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1854 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1855 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1856 1857 /* ========================================================================= */ 1858 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1859 /* ========================================================================= */ 1860 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1861 if (m) 1862 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1863 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1864 1865 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1866 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1867 1868 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1869 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1870 1871 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1872 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1873 1874 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1875 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1876 1877 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1878 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1879 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1880 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1881 */ 1882 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1883 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1884 fs->spsvBuffer_L = fs->factBuffer_M; 1885 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1886 } else { 1887 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1888 fs->spsvBuffer_U = fs->factBuffer_M; 1889 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1890 } 1891 1892 /* ========================================================================== */ 1893 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1894 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1895 /* ========================================================================== */ 1896 int structural_zero; 1897 cusparseStatus_t status; 1898 1899 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1900 if (m) 1901 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1902 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1903 if (PetscDefined(USE_DEBUG)) { 1904 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1905 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1906 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1907 } 1908 1909 /* Estimate FLOPs of the numeric factorization */ 1910 { 1911 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1912 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1913 PetscLogDouble flops = 0.0; 1914 1915 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1916 Ai = Aseq->i; 1917 Adiag = Aseq->diag; 1918 for (PetscInt i = 0; i < m; i++) { 1919 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1920 nzRow = Ai[i + 1] - Ai[i]; 1921 nzLeft = Adiag[i] - Ai[i]; 1922 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1923 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1924 */ 1925 nzLeft = (nzRow - 1) / 2; 1926 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1927 } 1928 } 1929 fs->numericFactFlops = flops; 1930 } 1931 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1932 PetscFunctionReturn(PETSC_SUCCESS); 1933 } 1934 1935 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1936 { 1937 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1938 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1939 const PetscScalar *barray; 1940 PetscScalar *xarray; 1941 1942 PetscFunctionBegin; 1943 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1944 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1945 PetscCall(PetscLogGpuTimeBegin()); 1946 1947 /* Solve L*y = b */ 1948 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1949 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1950 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1951 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1952 1953 /* Solve Lt*x = y */ 1954 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1955 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1956 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1957 1958 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1959 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1960 1961 PetscCall(PetscLogGpuTimeEnd()); 1962 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1963 PetscFunctionReturn(PETSC_SUCCESS); 1964 } 1965 1966 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1967 { 1968 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1969 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1970 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1971 CsrMatrix *Acsr; 1972 PetscInt m, nz; 1973 PetscBool flg; 1974 1975 PetscFunctionBegin; 1976 if (PetscDefined(USE_DEBUG)) { 1977 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1978 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1979 } 1980 1981 /* Copy A's value to fact */ 1982 m = fact->rmap->n; 1983 nz = aij->nz; 1984 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1985 Acsr = (CsrMatrix *)Acusp->mat->mat; 1986 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1987 1988 /* Factorize fact inplace */ 1989 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1990 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1991 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1992 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1993 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1994 */ 1995 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1996 if (PetscDefined(USE_DEBUG)) { 1997 int numerical_zero; 1998 cusparseStatus_t status; 1999 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 2000 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 2001 } 2002 2003 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 2004 if (fs->updatedSpSVAnalysis) { 2005 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2006 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2007 } else 2008 #endif 2009 { 2010 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 2011 2012 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2013 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2014 */ 2015 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 2016 fs->updatedSpSVAnalysis = PETSC_TRUE; 2017 } 2018 2019 fact->offloadmask = PETSC_OFFLOAD_GPU; 2020 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2021 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2022 fact->ops->matsolve = NULL; 2023 fact->ops->matsolvetranspose = NULL; 2024 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2025 PetscFunctionReturn(PETSC_SUCCESS); 2026 } 2027 2028 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2029 { 2030 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2031 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2032 PetscInt m, nz; 2033 2034 PetscFunctionBegin; 2035 if (PetscDefined(USE_DEBUG)) { 2036 PetscInt i; 2037 PetscBool flg, missing; 2038 2039 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2040 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2041 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2042 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2043 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2044 } 2045 2046 /* Free the old stale stuff */ 2047 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2048 2049 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2050 but they will not be used. Allocate them just for easy debugging. 
2051 */ 2052 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2053 2054 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2055 fact->factortype = MAT_FACTOR_ICC; 2056 fact->info.factor_mallocs = 0; 2057 fact->info.fill_ratio_given = info->fill; 2058 fact->info.fill_ratio_needed = 1.0; 2059 2060 aij->row = NULL; 2061 aij->col = NULL; 2062 2063 /* ====================================================================== */ 2064 /* Copy A's i, j to fact and also allocate the value array of fact. */ 2065 /* We'll do in-place factorization on fact */ 2066 /* ====================================================================== */ 2067 const int *Ai, *Aj; 2068 2069 m = fact->rmap->n; 2070 nz = aij->nz; 2071 2072 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2073 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2074 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2075 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2076 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2077 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2078 2079 /* ====================================================================== */ 2080 /* Create mat descriptors for M, L */ 2081 /* ====================================================================== */ 2082 cusparseFillMode_t fillMode; 2083 cusparseDiagType_t diagType; 2084 2085 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2086 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2087 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2088 2089 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2090 
cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2091 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2092 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2093 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2094 */ 2095 fillMode = CUSPARSE_FILL_MODE_LOWER; 2096 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2097 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2098 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2099 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2100 2101 /* ========================================================================= */ 2102 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2103 /* ========================================================================= */ 2104 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2105 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2106 2107 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2108 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2109 2110 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2111 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2112 2113 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2114 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2115 2116 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2117 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2118 2119 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2120 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2121 */ 2122 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2123 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2124 fs->spsvBuffer_L = fs->factBuffer_M; 2125 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2126 } else { 2127 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2128 fs->spsvBuffer_Lt = fs->factBuffer_M; 2129 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2130 } 2131 2132 /* ========================================================================== */ 2133 /* Perform analysis of ic0 on M */ 2134 /* The lower triangular part of M has the same sparsity pattern as L */ 2135 /* ========================================================================== */ 2136 int structural_zero; 2137 cusparseStatus_t status; 2138 2139 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2140 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2141 if (PetscDefined(USE_DEBUG)) { 2142 /* Function cusparseXcsric02_zeroPivot() is a blocking call. 
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2143 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2144 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2145 } 2146 2147 /* Estimate FLOPs of the numeric factorization */ 2148 { 2149 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2150 PetscInt *Ai, nzRow, nzLeft; 2151 PetscLogDouble flops = 0.0; 2152 2153 Ai = Aseq->i; 2154 for (PetscInt i = 0; i < m; i++) { 2155 nzRow = Ai[i + 1] - Ai[i]; 2156 if (nzRow > 1) { 2157 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2158 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2159 */ 2160 nzLeft = (nzRow - 1) / 2; 2161 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2162 } 2163 } 2164 fs->numericFactFlops = flops; 2165 } 2166 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2167 PetscFunctionReturn(PETSC_SUCCESS); 2168 } 2169 #endif 2170 2171 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2172 { 2173 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2174 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2175 2176 PetscFunctionBegin; 2177 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2178 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2179 B->offloadmask = PETSC_OFFLOAD_CPU; 2180 2181 if (!cusparsestruct->use_cpu_solve) { 2182 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2183 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2184 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2185 #else 2186 /* determine which version of MatSolve needs to be used. 
*/ 2187 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2188 IS isrow = b->row, iscol = b->col; 2189 PetscBool row_identity, col_identity; 2190 2191 PetscCall(ISIdentity(isrow, &row_identity)); 2192 PetscCall(ISIdentity(iscol, &col_identity)); 2193 if (row_identity && col_identity) { 2194 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2195 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2196 } else { 2197 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2198 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2199 } 2200 #endif 2201 } 2202 B->ops->matsolve = NULL; 2203 B->ops->matsolvetranspose = NULL; 2204 2205 /* get the triangular factors */ 2206 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2207 PetscFunctionReturn(PETSC_SUCCESS); 2208 } 2209 2210 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2211 { 2212 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2213 2214 PetscFunctionBegin; 2215 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2216 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2217 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2218 PetscFunctionReturn(PETSC_SUCCESS); 2219 } 2220 2221 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2222 { 2223 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2224 2225 PetscFunctionBegin; 2226 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2227 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2228 if (!info->factoronhost) { 2229 PetscCall(ISIdentity(isrow, &row_identity)); 2230 PetscCall(ISIdentity(iscol, &col_identity)); 2231 } 2232 if (!info->levels && row_identity && col_identity) { 2233 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, 
isrow, iscol, info)); 2234 } else 2235 #endif 2236 { 2237 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2238 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2239 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2240 } 2241 PetscFunctionReturn(PETSC_SUCCESS); 2242 } 2243 2244 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2245 { 2246 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2247 2248 PetscFunctionBegin; 2249 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2250 PetscBool perm_identity = PETSC_FALSE; 2251 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2252 if (!info->levels && perm_identity) { 2253 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2254 } else 2255 #endif 2256 { 2257 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2258 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2259 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2260 } 2261 PetscFunctionReturn(PETSC_SUCCESS); 2262 } 2263 2264 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2265 { 2266 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2267 2268 PetscFunctionBegin; 2269 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2270 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2271 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2276 { 2277 PetscFunctionBegin; 2278 *type = MATSOLVERCUSPARSE; 2279 PetscFunctionReturn(PETSC_SUCCESS); 2280 } 2281 2282 /*MC 2283 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 
on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the square (n x n, n = local rows of A) factor
  matrix *B of type MATSEQAIJCUSPARSE and installs the symbolic factorization routines that match
  the requested factor type.

  Input Parameters:
+ A     - the matrix to be factored
- ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
. B - the newly created factor matrix

  Notes:
  When A is bound to the CPU the host SeqAIJ symbolic routines are installed instead of the
  CUSPARSE ones. Any other factor type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  F = *B;
  PetscCall(MatSetSizes(F, nloc, nloc, nloc, nloc));
  F->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(F, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F, PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F, A, A));
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* nested dissection for the full factorization, natural ordering for the incomplete ones */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F, MAT_SKIP_ALLOCATION, NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values from the device to the host array
  a->a when the offload mask says the up-to-date copy lives on the GPU only; otherwise a no-op.

  For unfactored matrices the values come from the device CSR structure. For factored matrices
  (CUDA >= 11.4 only) they come from the csrVal array of the triangular-factor structure, if it
  is set. Any other case raises PETSC_ERR_SUP. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* same spptr, interpreted for factored matrices */
#endif
  const void *d_vals = NULL; /* device source of the values */

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    d_vals = ((CsrMatrix *)cusp->mat->mat)->values->data().get();
  }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  else if (fs->csrVal) {
    /* We have a factorized matrix on device and are able to copy it to host */
    d_vals = fs->csrVal;
  }
#endif
  else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
  PetscCallCUDA(cudaMemcpy(aij->a, d_vals, aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu(aij->nz * sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-write host access: bring the values to the host first, then hand out the host array */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-write host access: the host copy is now the only valid one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only host access: bring the values to the host first, then hand out the host array */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-only host access: nothing was modified, so the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only host access: no device-to-host copy is made before handing out the host array */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of write-only host access: the host copy is now the only valid one */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode
MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  /*
    Returns device pointers to the CSR arrays of an unfactored matrix.

    Input Parameter:
  . A - the matrix; must not be a factored matrix

    Output Parameters (each may be NULL if not wanted):
  + i     - device row offsets (not available with 64-bit PetscInt, raises PETSC_ERR_SUP)
  . j     - device column indices (not available with 64-bit PetscInt, raises PETSC_ERR_SUP)
  . a     - device values
  - mtype - always PETSC_MEMTYPE_CUDA
  */
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Validate BEFORE syncing to the device: MatSeqAIJCUSPARSECopyToGPU() casts A->spptr to
     Mat_SeqAIJCUSPARSE and dereferences it unconditionally, so checking afterwards is too late */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device copy of an unfactored MATSEQAIJCUSPARSE matrix
  current.

  Nothing is done unless the offload mask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  If the nonzero pattern is unchanged since the last upload, the format is CSR and a device
  structure already exists, only the values are re-uploaded. Otherwise the device structure
  (and the cached transpose) is destroyed and rebuilt from the host CSR data, honoring
  compressed-row storage.

  The offload mask becomes PETSC_OFFLOAD_BOTH, except when the host has no values array
  (a->a == NULL): then only the pattern is uploaded and the mask is left unchanged.

  Errors with PETSC_ERR_GPU if the matrix is bound to the CPU or required host data is missing.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* The values-only fast path needs an existing device structure; without one fall through to the full build */
    if (matstruct && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* pattern-only upload: take the count from the row offsets and do not mark the values as synced */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* upload a temporary CSR copy, convert it to HYB/ELL on the device, then discard the CSR copy */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* Log what was actually uploaded: m+1 row offsets and nnz column indices (32-bit), tmp compressed-row
           indices, the 3 scalar constants, and the nnz values only when the host has a values array */
        PetscCall(PetscLogCpuToGpu(((m + 1) + nnz) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->a ? nnz : 0)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Functor for zipped ranges: second element += first element */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for zipped ranges: second element = first element */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
}
};

/* Functor for zipped ranges: first element = second element */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product data attached to C->product->data by the symbolic routines in this file */
struct MatMatCusparse {
  PetscBool      cisdense; /* C was MATSEQDENSE on entry; convert it back after the GPU computation */
  PetscScalar   *Bt;       /* device buffer for B^T (only allocated for CUDA < 11) */
  Mat            X;        /* intermediate dense result for PtAP and RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor installed as C->product->destroy: releases every device buffer, cuSPARSE descriptor
   and the intermediate matrix owned by a MatMatCusparse, then frees the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of sparse (A) times dense (B)
  products: AB, AtB, ABt, and the sparse-times-dense first stage of PtAP and RARt.

  For PtAP and RARt the sparse product is written to the intermediate mmdata->X and C is then
  formed by a dense-dense product with B. A host-dense B is converted in place to
  MATSEQDENSECUDA for the computation and converted back before returning; likewise C is
  converted back to MATSEQDENSE when mmdata->cisdense is set.

  Requires C->product->data to have been created by the matching symbolic routine, and A must
  not be bound to the CPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick the sparse operand/operation and the size (m x n) of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    } else {
      /* descriptor survives (only the LDA of C changed): still refresh its data pointer */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    } else {
      /* descriptor survives (only the LDA of B changed): still refresh its data pointer */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    } else {
      /* reused sparse descriptor: same "always update pointers" rule as in the steady-state branch */
      PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* grow-only work buffer; cleared after the free so a failed cudaMalloc cannot cause a double free at destroy time */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      mmdata->mmBuffer     = NULL;
      mmdata->mmBufferSize = 0;
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt with an out-of-place geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product =
C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  /*
    Symbolic phase of the sparse (A) times dense (B) products AB, AtB, ABt, PtAP and RARt:
    sizes C and sets its block sizes, forces C to MATSEQDENSECUDA (remembering whether it was
    MATSEQDENSE), creates the MatMatCusparse product data and installs the numeric routine.
    Only the MAT_CUSPARSE_CSR storage format of A is supported.
  */
  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* result dimensions (m x n) and block sizes, per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = n = B->cmap->n;
    if (B->cmap->bs > 0) {
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    }
    break;
  case MATPRODUCT_RARt:
    m = n = B->rmap->n;
    if (B->rmap->bs > 0) {
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    }
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* X holds A*B^T (RARt) or A*B (PtAP): rows of A by rows, respectively columns, of B */
    const PetscInt xrows = A->rmap->n;
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(mmdata->X, xrows, xcols, xrows, xcols));
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of the sparse-sparse products
  AB, AtB and ABt with all operands of type MATSEQAIJCUSPARSE in CSR format.

  The computation is skipped (only the assembly bookkeeping at `finalize` is done) when the
  symbolic phase already produced the values (mmdata->reusesym) or when C has no nonzeros.
  AtB with symmetric A and ABt with symmetric B are computed as AB, provided the symbolic phase
  recorded that it relied on the same fact. After a computation the values of C are valid on
  the GPU only.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     istype;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &istype));
  PetscCheck(istype, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE; /* one-shot: the next numeric call recomputes */
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;

  /* operand validation */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &istype));
  PetscCheck(istype, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &istype));
  PetscCheck(istype, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transpose of a symmetric operand is a no-op, but only if the symbolic phase made the same choice */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }

  /* transposed operands use the cached transpose structures, since opA and opB are always NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3068 3069 PetscFunctionBegin; 3070 MatCheckProduct(C, 1); 3071 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3072 A = product->A; 3073 B = product->B; 3074 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3075 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3076 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3077 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3078 a = (Mat_SeqAIJ *)A->data; 3079 b = (Mat_SeqAIJ *)B->data; 3080 /* product data */ 3081 PetscCall(PetscNew(&mmdata)); 3082 C->product->data = mmdata; 3083 C->product->destroy = MatDestroy_MatMatCusparse; 3084 3085 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3086 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3087 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3088 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3089 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3090 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3091 3092 ptype = product->type; 3093 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3094 ptype = MATPRODUCT_AB; 3095 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3096 } 3097 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3098 ptype = MATPRODUCT_AB; 3099 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3100 } 3101 biscompressed = PETSC_FALSE; 3102 ciscompressed = PETSC_FALSE; 3103 switch (ptype) { 3104 case MATPRODUCT_AB: 3105 m 
= A->rmap->n; 3106 n = B->cmap->n; 3107 k = A->cmap->n; 3108 Amat = Acusp->mat; 3109 Bmat = Bcusp->mat; 3110 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3111 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3112 break; 3113 case MATPRODUCT_AtB: 3114 m = A->cmap->n; 3115 n = B->cmap->n; 3116 k = A->rmap->n; 3117 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3118 Amat = Acusp->matTranspose; 3119 Bmat = Bcusp->mat; 3120 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3121 break; 3122 case MATPRODUCT_ABt: 3123 m = A->rmap->n; 3124 n = B->rmap->n; 3125 k = A->cmap->n; 3126 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3127 Amat = Acusp->mat; 3128 Bmat = Bcusp->matTranspose; 3129 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3130 break; 3131 default: 3132 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3133 } 3134 3135 /* create cusparse matrix */ 3136 PetscCall(MatSetSizes(C, m, n, m, n)); 3137 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3138 c = (Mat_SeqAIJ *)C->data; 3139 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3140 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3141 Ccsr = new CsrMatrix; 3142 3143 c->compressedrow.use = ciscompressed; 3144 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3145 c->compressedrow.nrows = a->compressedrow.nrows; 3146 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3147 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3148 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3149 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3150 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3151 } else { 3152 c->compressedrow.nrows = 0; 3153 c->compressedrow.i = NULL; 3154 
c->compressedrow.rindex = NULL; 3155 Ccusp->workVector = NULL; 3156 Cmat->cprowIndices = NULL; 3157 } 3158 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3159 Ccusp->mat = Cmat; 3160 Ccusp->mat->mat = Ccsr; 3161 Ccsr->num_rows = Ccusp->nrows; 3162 Ccsr->num_cols = n; 3163 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3164 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3165 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3166 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3167 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3168 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3169 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3170 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3171 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3172 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3173 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3174 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3175 c->nz = 0; 3176 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3177 Ccsr->values = new THRUSTARRAY(c->nz); 3178 goto finalizesym; 3179 } 3180 3181 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3182 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3183 Acsr = (CsrMatrix *)Amat->mat; 3184 if (!biscompressed) { 3185 Bcsr = (CsrMatrix *)Bmat->mat; 3186 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3187 BmatSpDescr = Bmat->matDescr; 3188 #endif 3189 } else { /* we need to use row offsets for the full matrix */ 3190 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3191 Bcsr = new CsrMatrix; 3192 Bcsr->num_rows = B->rmap->n; 3193 Bcsr->num_cols = cBcsr->num_cols; 3194 Bcsr->num_entries = cBcsr->num_entries; 3195 Bcsr->column_indices = cBcsr->column_indices; 3196 Bcsr->values = cBcsr->values; 3197 if (!Bcusp->rowoffsets_gpu) { 3198 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3199 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3200 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3201 } 3202 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3203 mmdata->Bcsr = Bcsr; 3204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3205 if (Bcsr->num_rows && Bcsr->num_cols) { 3206 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3207 PetscCallCUSPARSE(stat); 3208 } 3209 BmatSpDescr = mmdata->matSpBDescr; 3210 #endif 3211 } 3212 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3213 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3214 /* precompute flops count */ 3215 if (ptype == MATPRODUCT_AB) { 3216 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3217 const PetscInt st = a->i[i]; 3218 const PetscInt en = a->i[i + 1]; 3219 for (j = st; j < en; j++) { 3220 const PetscInt brow = a->j[j]; 3221 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3222 } 3223 } 3224 } else if (ptype == MATPRODUCT_AtB) { 3225 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3226 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3227 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3228 flops += (2. * anzi) * bnzi; 3229 } 3230 } else { /* TODO */ 3231 flops = 0.; 3232 } 3233 3234 mmdata->flops = flops; 3235 PetscCall(PetscLogGpuTimeBegin()); 3236 3237 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3238 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3239 // cuda-12.2 requires non-null csrRowOffsets 3240 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3241 PetscCallCUSPARSE(stat); 3242 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3243 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3244 { 3245 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3246 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3247 */ 3248 void *dBuffer1 = NULL; 3249 void *dBuffer2 = NULL; 3250 void *dBuffer3 = NULL; 3251 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3252 size_t bufferSize1 = 0; 3253 size_t bufferSize2 = 0; 3254 size_t bufferSize3 = 0; 3255 size_t bufferSize4 = 0; 3256 size_t bufferSize5 = 0; 3257 3258 /* ask bufferSize1 bytes for external memory */ 3259 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3260 PetscCallCUSPARSE(stat); 3261 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3262 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3263 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3264 PetscCallCUSPARSE(stat); 3265 3266 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3267 PetscCallCUSPARSE(stat); 3268 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3269 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3270 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3271 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3272 PetscCallCUSPARSE(stat); 3273 PetscCallCUDA(cudaFree(dBuffer1)); 3274 PetscCallCUDA(cudaFree(dBuffer2)); 3275 3276 /* get matrix C non-zero entries C_nnz1 */ 3277 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3278 c->nz = (PetscInt)C_nnz1; 3279 /* allocate matrix C */ 3280 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3281 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3282 Ccsr->values = new THRUSTARRAY(c->nz); 3283 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3284 /* update matC with the new pointers */ 3285 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3286 PetscCallCUSPARSE(stat); 3287 3288 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3289 PetscCallCUSPARSE(stat); 3290 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3291 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3292 PetscCallCUSPARSE(stat); 3293 PetscCallCUDA(cudaFree(dBuffer3)); 3294 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3295 PetscCallCUSPARSE(stat); 3296 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3297 } 3298 #else 3299 size_t bufSize2; 3300 /* ask bufferSize bytes for external memory */ 3301 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3302 PetscCallCUSPARSE(stat); 3303 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3304 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3305 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3306 PetscCallCUSPARSE(stat); 3307 /* ask bufferSize again bytes for external memory */ 3308 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3309 PetscCallCUSPARSE(stat); 3310 /* The CUSPARSE documentation is not clear, nor the API 3311 We need both buffers to perform the operations properly! 3312 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3313 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3314 is stored in the descriptor! What a messy API... 
*/ 3315 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3316 /* compute the intermediate product of A * B */ 3317 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3318 PetscCallCUSPARSE(stat); 3319 /* get matrix C non-zero entries C_nnz1 */ 3320 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3321 c->nz = (PetscInt)C_nnz1; 3322 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3323 mmdata->mmBufferSize / 1024)); 3324 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3325 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3326 Ccsr->values = new THRUSTARRAY(c->nz); 3327 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3328 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3329 PetscCallCUSPARSE(stat); 3330 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3331 PetscCallCUSPARSE(stat); 3332 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3333 #else 3334 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3335 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3336 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3337 PetscCallCUSPARSE(stat); 3338 c->nz = cnz; 3339 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3340 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3341 Ccsr->values = new THRUSTARRAY(c->nz); 3342 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3343 3344 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3345 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3346 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3347 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3348 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3349 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3350 PetscCallCUSPARSE(stat); 3351 #endif 3352 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3353 PetscCall(PetscLogGpuTimeEnd()); 3354 finalizesym: 3355 c->free_a = PETSC_TRUE; 3356 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3357 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3358 c->free_ij = PETSC_TRUE; 3359 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3360 PetscInt *d_i = c->i; 3361 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3362 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3363 ii = 
*Ccsr->row_offsets; 3364 jj = *Ccsr->column_indices; 3365 if (ciscompressed) d_i = c->compressedrow.i; 3366 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3367 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3368 } else { 3369 PetscInt *d_i = c->i; 3370 if (ciscompressed) d_i = c->compressedrow.i; 3371 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3372 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3373 } 3374 if (ciscompressed) { /* need to expand host row offsets */ 3375 PetscInt r = 0; 3376 c->i[0] = 0; 3377 for (k = 0; k < c->compressedrow.nrows; k++) { 3378 const PetscInt next = c->compressedrow.rindex[k]; 3379 const PetscInt old = c->compressedrow.i[k]; 3380 for (; r < next; r++) c->i[r + 1] = old; 3381 } 3382 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3383 } 3384 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3385 PetscCall(PetscMalloc1(m, &c->ilen)); 3386 PetscCall(PetscMalloc1(m, &c->imax)); 3387 c->maxnz = c->nz; 3388 c->nonzerorowcnt = 0; 3389 c->rmax = 0; 3390 for (k = 0; k < m; k++) { 3391 const PetscInt nn = c->i[k + 1] - c->i[k]; 3392 c->ilen[k] = c->imax[k] = nn; 3393 c->nonzerorowcnt += (PetscInt)!!nn; 3394 c->rmax = PetscMax(c->rmax, nn); 3395 } 3396 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3397 PetscCall(PetscMalloc1(c->nz, &c->a)); 3398 Ccsr->num_entries = c->nz; 3399 3400 C->nonzerostate++; 3401 PetscCall(PetscLayoutSetUp(C->rmap)); 3402 PetscCall(PetscLayoutSetUp(C->cmap)); 3403 Ccusp->nonzerostate = C->nonzerostate; 3404 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3405 C->preallocated = PETSC_TRUE; 3406 C->assembled = 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - selects the symbolic product routine for a
  product whose A is MATSEQAIJCUSPARSE; B may be sparse or dense.

  Selection order:
    - B is (a subtype of) MATSEQDENSE: the dense CUDA kernel, or the SeqAIJ-SeqDense CPU
      path when A is bound to the CPU; ABC goes through MatProductSymbolic_ABC_Basic.
    - B (and C, for ABC) are MATSEQAIJCUSPARSE, none of the matrices involved is bound to
      the CPU, and the user did not request the CPU backend through the options database:
      the cuSPARSE SpGEMM routine for AB/AtB/ABt, MatProductSymbolic_ABC_Basic for
      PtAP/RARt/ABC.
    - anything else: the plain SeqAIJ implementation.

  Input Parameter:
. mat - the product matrix (mat->product must be set)
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *prod      = mat->product;
  PetscBool    Bdense    = PETSC_FALSE;
  PetscBool    Bcusparse = PETSC_FALSE;
  PetscBool    Ccusparse = PETSC_TRUE;
  PetscBool    gpusparse;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)prod->B, MATSEQDENSE, &Bdense));
  if (!prod->A->boundtocpu && !prod->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)prod->B, MATSEQAIJCUSPARSE, &Bcusparse));
  if (prod->type == MATPRODUCT_ABC) {
    /* a third operand exists only for ABC; it must qualify for the GPU path as well */
    Ccusparse = PETSC_FALSE;
    if (!prod->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)prod->C, MATSEQAIJCUSPARSE, &Ccusparse));
  }
  gpusparse = (Bcusparse && Ccusparse) ? PETSC_TRUE : PETSC_FALSE;

  if (gpusparse) { /* we can always select the CPU backend */
    const PetscBool api      = prod->api_user;
    const char     *title    = NULL; /* heading passed to PetscOptionsBegin() */
    const char     *option   = NULL; /* name of the "use the CPU" switch; stays NULL when the type has none */
    const char     *manpage  = NULL; /* manual page associated with the switch */
    PetscBool       forcecpu = PETSC_FALSE;

    /* The option name depends on the product type and on whether the legacy API (MatMatMult() etc.) was used */
    switch (prod->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = api ? "MatMatMult" : "MatProduct_AB";
      option  = api ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = api ? "MatTransposeMatMult" : "MatProduct_AtB";
      option  = api ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = api ? "MatPtAP" : "MatProduct_PtAP";
      option  = api ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = api ? "MatRARt" : "MatProduct_RARt";
      option  = api ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = api ? "MatMatMatMult" : "MatProduct_ABC";
      option  = api ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default:
      break;
    }
    if (option) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(option, "Use CPU code", manpage, forcecpu, &forcecpu, NULL));
      PetscOptionsEnd();
    }
    if (forcecpu) gpusparse = PETSC_FALSE;
  }

  /* dispatch */
  if (Bdense) {
    switch (prod->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!prod->A->boundtocpu) mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      else PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      break;
    default:
      break;
    }
  } else if (gpusparse) {
    switch (prod->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one thread per entry.

  Launch with a 1D grid providing at least n threads; surplus threads do nothing.
  The update is a plain (non-atomic) read-modify-write, so the kernel presumably
  relies on idx[] holding no repeated values -- confirm against the caller.

  Input Parameters:
+ n   - number of entries to scatter
. idx - device array of n destination indices into y
- x   - device array of n values

  Output Parameter:
. y - device array updated in place
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt: with 64-bit indices n may exceed INT_MAX, and an int index would wrap */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common GPU kernel behind MatMult/MatMultAdd and their
  (Hermitian) transpose variants: zz = op(A) xx + yy.

  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

  Input Parameters:
+ A     - the matrix
. xx    - the input vector
. yy    - the vector to add, or NULL for a plain product; may be the same object as zz
. trans - apply the transpose of A
- herm  - together with trans, apply the conjugate transpose; herm without trans is an error

  Output Parameter:
. zz - the result

  Notes:
  When A has no nonzeros the result is simply yy (or zero) and cuSPARSE is not called.

  For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose
  is multiplied with a non-transposed operation; otherwise cuSPARSE is asked to apply the
  (conjugate) transpose of the stored matrix.

  When the matrix is stored with compressed rows, a work vector holds the short side of the
  product and is scattered into (non-transpose) or gathered from (transpose) the full vector.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx = 0, ny = 0; /* only meaningful (and only used) for the CSR format */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) xx is zero, so zz = yy or zz = 0 */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length and is written straight into z. z only holds valid data (namely y) when yy == zz,
         because otherwise zarray was obtained write-only above. So we accumulate (beta = 1) only in that case;
         for a distinct yy we overwrite z here (beta = 0) and add yy afterwards.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA is used as an array index below, so its numeric value must stay within 0..2 */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status; pick up launch failures here */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop count: 2*nz with the add, 2*nz - (number of nonzero rows) without */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly completion simply forwards to the SeqAIJ implementation */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3776 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3777 3778 Output Parameter: 3779 . A - the matrix 3780 3781 Level: intermediate 3782 3783 Notes: 3784 This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for 3785 calculations. For good matrix assembly performance the user should preallocate the matrix 3786 storage by setting the parameter `nz` (or the array `nnz`). 3787 3788 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3789 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3790 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3791 3792 The AIJ format, also called 3793 compressed row storage, is fully compatible with standard Fortran 3794 storage. That is, the stored row and column indices can begin at 3795 either one (as in Fortran) or zero. 3796 3797 Specify the preallocated storage with either nz or nnz (not both). 3798 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3799 allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create -> size (local sizes equal global sizes) -> pick the CUSPARSE type -> preallocate the host CSR storage */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Destroy routine of the type: frees whatever hangs off A->spptr (plain matrix data or triangular
  factors, depending on A->factortype), removes every function composed on the object by this type,
  and finally hands A to the SeqAIJ destructor.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in the order they are removed */
  static const char *const composedNames[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  }
  for (size_t k = 0; k < sizeof(composedNames) / sizeof(composedNames[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composedNames[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicates A through the SeqAIJ implementation and then converts the duplicate in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Y = a*X + Y.

  - If X and Y do not share the same axpy implementation, the SeqAIJ routine is used.
  - SAME_NONZERO_PATTERN (given, or detected by comparing the device CSR index arrays) is done as a cuBLAS axpy on the value arrays.
  - SUBSET_NONZERO_PATTERN is done with the cuSPARSE csr geam routine, writing the result in place into Y.
  - Anything else falls back to the SeqAIJ routine.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *ycusp = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *xcusp = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;
  CsrMatrix          *ycsr, *xcsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ycusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ycusp->mat->mat;
  xcsr = (CsrMatrix *)xcusp->mat->mat;

  /* see if we can turn this into a cublas axpy: same nnz, no compressed rows, identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    if (thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin()) && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin())) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar unit = 1.0; /* coefficient applied to Y in a*X + unit*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t workBytes;
    void  *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xvals));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yvals));

    /* raw device index arrays of both operands; Y's arrays double as the output pattern */
    auto xrow = xcsr->row_offsets->data().get();
    auto xcol = xcsr->column_indices->data().get();
    auto yrow = ycsr->row_offsets->data().get();
    auto ycol = ycsr->column_indices->data().get();

    /* the scalars a and unit live on the host for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &unit, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol, &workBytes));
    PetscCallCUDA(cudaMalloc(&work, workBytes));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &unit, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol, work));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(work));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xvals, xrow, xcol, &unit, ycusp->mat->descr, yaij->nz, yvals, yrow, ycol, ycusp->mat->descr, yvals, yrow, ycol));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xvals));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yvals));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t blasHandle;
    PetscBLASInt   stride = 1, count = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xvals));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yvals));
    PetscCall(PetscCUBLASGetHandle(&blasHandle));
    PetscCall(PetscBLASIntCast(xaij->nz, &count));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blasHandle, count, &a, xvals, stride, yvals, stride));
    PetscCall(PetscLogGpuFlops(2.0 * count));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xvals));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yvals));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  default:
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y, done as one cuBLAS scal over the nz stored values on the device */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yaij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *yvals;
  cublasHandle_t blasHandle;
  PetscBLASInt   stride = 1, count = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yvals));
  PetscCall(PetscCUBLASGetHandle(&blasHandle));
  PetscCall(PetscBLASIntCast(yaij->nz, &count));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blasHandle, count, &a, yvals, stride));
  PetscCall(PetscLogGpuFlops(count));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yvals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Zeros the stored values on the host and, when device copies exist, on the device as well
  (matrix and cached transpose). The offload mask becomes BOTH only if the device values of the
  matrix itself were zeroed, otherwise CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij          = (Mat_SeqAIJ *)A->data;
  PetscBool   deviceZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE           *cusp     = (Mat_SeqAIJCUSPARSE *)A->spptr;
    Mat_SeqAIJCUSPARSEMultStruct *parts[2] = {cusp->mat, cusp->matTranspose}; /* [0] matrix, [1] cached transpose */

    for (int k = 0; k < 2; k++) {
      if (!parts[k]) continue;
      CsrMatrix *csr = (CsrMatrix *)parts[k]->mat;
      if (!csr->values) continue;
      thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      if (k == 0) deviceZeroed = PETSC_TRUE; /* only the matrix itself decides the offload state */
    }
  }
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = deviceZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reports PETSC_MEMTYPE_CUDA unconditionally */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Switches the operation table of A between the host (SeqAIJ) and the device (SeqAIJCUSPARSE)
  implementations. Factored matrices only record the flag. Binding to the CPU first copies the data
  back from the GPU and removes the GPU-specific composed functions; unbinding installs them again.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (!flg) {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* composed functions that only make sense while the matrix lives on the GPU, in removal order */
    static const char *const gpuOnlyNames[] = {"MatSeqAIJCopySubArray_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C"};

    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    for (size_t k = 0; k < sizeof(gpuOnlyNames) / sizeof(gpuOnlyNames[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, gpuOnlyNames[k], NULL));
  }

  A->boundtocpu = flg;
  /* inodes are used only while bound to the CPU, and only if inode data exists */
  aij->inode.use = (flg && aij->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

  MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the existing *newmat,
  any other reuse value works on *newmat as given. The GPU bookkeeping struct (with its own cuSPARSE
  handle bound to PetscDefaultCudaStream) is created when missing, except for MAT_REUSE_MATRIX.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that stay the same whether or not the matrix is bound to the CPU */
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* installs the remaining, GPU-specific operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Constructor of the type: sets B up as a SeqAIJ matrix, then converts it in place to MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  Mat inplace = B; /* the in-place conversion reads the matrix to convert through this handle */

  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &inplace));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                      Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Registers MATSOLVERCUSPARSE as provider of LU, Cholesky, ILU and ICC factors for MATSEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Frees everything owned by the Mat_SeqAIJCUSPARSE attached to an unfactored matrix: the matrix and
  transpose mult structs, the helper device arrays, the cuSPARSE handle and the struct itself.
  Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Deletes the three device arrays of a CsrMatrix and the CsrMatrix itself, then sets *mat to NULL; a NULL *mat is a no-op */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Overload for triangular-factor structs (only built for CUDA < 11.4): releases descriptors, CSR data and work buffers, then the struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  Releases a mult struct (matrix storage, descriptors, device scalars, cached SpMV data) and sets
  *matstruct to NULL. A NULL *matstruct is a no-op.

  format tells how (*matstruct)->mat has to be interpreted: ELL/HYB storage is a cusparseHybMat_t
  (an error with CUDA >= 11.0, where those formats are not supported), everything else a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one slot per cusparseOperation_t value */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      /* The per-operation matrix descriptors are released independently of the SpMV state: a descriptor
         may exist for an operation whose cuSpMV slot was never initialized (presumably the SpMM ones,
         created by the matrix-product code), and it would leak if this were tied to `initialized` */
      if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
      if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Releases all factorization data held by a Mat_SeqAIJCUSPARSETriFactors but keeps the struct and its
  cuSPARSE handle, so it can be filled again. A NULL *trifactors is a no-op.

  Every released pointer/descriptor is set to NULL right after its release, so that calling this
  routine again (MatSeqAIJCUSPARSETriFactors_Destroy() always does) cannot free the same resource twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    fs->factBuffer_M = NULL; /* only an alias of one of the buffers freed below */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* host mirrors */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Resets the triangular-factor data, destroys its cuSPARSE handle and frees the struct; a NULL *trifactors is a no-op */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict lexicographic "less than" on (first, second) index pairs; const so it can be called through const comparator objects */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/*
  Marks the cached transpose of A as out of date (A->transupdated = PETSC_FALSE). With destroy set,
  the transpose mult struct and the csr2csc index array are freed as well.
  Nothing is done when A has no GPU data struct attached.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Container destructor for the device-side COO struct: frees its device arrays perm and jmap, then the host struct */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  COO preallocation. The analysis itself is done on the host by MatSetPreallocationCOO_SeqAIJ()
  (indices given in device memory are first copied to the host); afterwards the matrix is pushed to
  the GPU and a device copy of the jmap/perm arrays is attached to the matrix as
  "__PETSc_MatCOOStruct_Device" for use by MatSetValuesCOO_SeqAIJCUSPARSE().
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  /* the memory type of coo_i decides for both index arrays */
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  For every stored nonzero i < nnz: a[i] = (imode == INSERT_VALUES ? 0 : a[i]) + sum of kv[perm[k]] for k in [jmap[i], jmap[i+1]).

  1D launch with a grid-stride loop, so any grid/block size covers all nnz entries; no shared memory.
  The thread index and the stride are computed in PetscCount so the products cannot wrap in 32-bit
  unsigned arithmetic.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  Sets (INSERT_VALUES) or accumulates (any other mode) the COO values v[] into the device values of A,
  using the device COO struct attached by MatSetPreallocationCOO_SeqAIJCUSPARSE(). v[] may live on the
  host, in which case a temporary device copy of coo->n entries is made.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT overwrites every value, so write-only access is enough; otherwise the old values are needed */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* divide in PetscCount first and narrow afterwards, so that Annz + 255 is never truncated to int */
    const int nblocks = (int)((Annz + 255) / 256);

    /* launched on PetscDefaultCudaStream like the other kernels of this file */
    MatAddCOOValues<<<nblocks, 256, 0, PetscDefaultCudaStream>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately without copying the matrix to the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* each output is optional on its own; bail out only when nothing at all is requested */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  a    = (Mat_SeqAIJ *)A->data;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the uncompressed row offsets on the device once and cache them */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

  Note:
  The pointers that are passed are set to `NULL`; `compressed` is not used

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  The pointer is set to `NULL`; the matrix itself is not modified

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  May trigger host-device copies if up-to-date matrix data is on host

  The offload mask is set to `PETSC_OFFLOAD_GPU` and the cached transpose is flagged as out of date

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  Invalidates the cached diagonal, increases the object state of `A` and sets the pointer to `NULL`

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no copy to the GPU here: the device structure must already exist */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  Invalidates the cached diagonal, increases the object state of `A` and sets the pointer to `NULL`

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Orders (row, column, value, tag) tuples lexicographically by (row, column); value and tag are ignored */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Unary functor adding a fixed offset to an integer index */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};

/*
  merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

  Input Parameters:
+ A     - first `MATSEQAIJCUSPARSE` matrix
. B     - second `MATSEQAIJCUSPARSE` matrix, with the same number of rows as A
- reuse - `MAT_INITIAL_MATRIX` creates C; any other supported value refreshes the values of an existing C.
          `MAT_INPLACE_MATRIX` is not supported

  Output Parameter:
. C - the merged matrix, with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  With `MAT_INITIAL_MATRIX` the routine stores in Ccusp->coords, for every entry of A (first Annz slots)
  and of B (remaining slots), its position in the values of C. The reuse path only scatters the values
  of A and B through that permutation.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* A and B are dereferenced only after they have been validated */
  a     = (Mat_SeqAIJ *)A->data;
  b     = (Mat_SeqAIJ *)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit per-entry row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the (row, column)-sorted entries of A and B; wPerm records the origin of each merged entry (1: A, 0: B) */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
  #else
      auto pred = cuda::std::identity();
  #endif
      /* positions in C of the entries coming from A go to coords[0..Annz), those coming from B to coords[Annz..nz) */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* the last offset of A^T is overwritten by the first (shifted) offset of B^T */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* build the host-side CSR structure of C from the device one */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* the existing C is about to be dereferenced: make sure it is a matrix of the expected type */
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheckTypeName(*C, MATSEQAIJCUSPARSE);
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter the values of A, then those of B, into C through the stored permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies selected matrix values from the device array of A into v

  Input Parameters:
+ A   - the matrix
. n   - number of values to copy
- idx - positions in the value array of A (host memory); if `NULL`, the first n values are copied

  Output Parameter:
. v - destination, in host or device memory (detected with isCudaMem())

  With a host destination the gathered values are staged in a temporary device array and then copied back.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                     w; /* staging buffer, owned by this scope so it is released on every exit path */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[idx[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travel from the device to the host, so account for them as GPU -> CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}