/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
#include <cuda/std/functional>
#endif

PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
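
/* A minimal usage sketch (illustrative; n and the nonzero preallocation are assumed to be
   set up elsewhere):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   The same effect is available at runtime via -mat_cusparse_mult_storage_format ell
   (see MatSetFromOptions_SeqAIJCUSPARSE() below). */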

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr as the flag, since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis() before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
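
/* Worked example of the re-arrangement above (illustrative): for m = 2 with factors
       L = [1 0; l10 1],  U = [u00 u01; 0 u11]
   the host factor stores L strictly lower and keeps 1/u_ii on the diagonal of U, while M
   merges both triangles into one regular CSR matrix:
       row 0: u00 u01   (llen = 0, ulen = 2)
       row 1: l10 u11   (llen = 1, ulen = 1)
   giving Mi = {0, 2, 4} and Mj = {0, 1, 0, 1}, with the true diagonal recovered in the
   value-copy loop as 1.0 / Aa[Adiag[i]]. */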
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
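
/* Small worked example of the construction above (illustrative): for n = 3 with strictly
   lower entries l10, l20, l21, the unit-diagonal lower factor is stored explicitly as
       AiLo = {0, 1, 3, 6}
       AjLo = {0,   0, 1,   0, 1, 2}
       AALo = {1,   l10, 1,   l20, l21, 1}
   i.e. each row keeps its strictly-lower entries followed by an explicit 1.0 on the
   diagonal, since the factor is handed to cuSPARSE as a general CSR matrix. */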

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
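
/* Informal sketch of how the cached permutations are used later: with the row ordering P
   (rpermIndices, from isrow) and the column ordering Q (cpermIndices, from isicol) of the
   factorization, the solve phase computes

       x = Q * ( U \ ( L \ (P * b) ) )

   applying P and Q on the GPU with thrust permutation iterators (see the MatSolve_*
   routines below) rather than permuting b and x on the host. */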

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr as the flag, since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
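
/* Informal summary of the value conversion above: the host Cholesky/ICC factor keeps the
   *inverse* pivots in Aa[Adiag[i]] and the off-diagonal entries of U with flipped sign, so
   the copy loop (i) stores the inverse pivots in the separate diagonal vector D, (ii)
   negates the stored off-diagonals to recover U, and (iii) writes a cosmetic 1.0 on the
   diagonal that cuSPARSE never reads (CUSPARSE_DIAG_TYPE_UNIT). The solve phase below then
   computes x = U \ (D .* (U^T \ b)) with D already inverted. */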

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if we need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
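
/* End-to-end sketch of how this solver is reached (illustrative; perm and info are assumed
   to be a valid IS and MatFactorInfo):

     Mat F;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_CHOLESKY, &F));
     PetscCall(MatCholeskyFactorSymbolic(F, A, perm, &info));
     PetscCall(MatCholeskyFactorNumeric(F, A, &info));
     PetscCall(MatSolve(F, b, x)); // dispatches to MatSolve_SeqAIJCUSPARSE_Cholesky here

   or, through KSP/PC, with -pc_type cholesky -pc_factor_mat_solver_type cusparse. */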
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
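
/* Note on the overall flow (informal): the numeric factorization itself runs on the CPU via
   MatCholeskyFactorNumeric_SeqAIJ(), B is then marked PETSC_OFFLOAD_CPU, and
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() re-packs and uploads the factors, so the GPU
   matrix type accelerates the triangular solves, not the factorization itself. */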
*/ 1045 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1046 IS ip = b->row; 1047 PetscBool perm_identity; 1048 1049 PetscCall(ISIdentity(ip, &perm_identity)); 1050 if (perm_identity) { 1051 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1052 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1053 } else { 1054 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1055 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1056 } 1057 #endif 1058 B->ops->matsolve = NULL; 1059 B->ops->matsolvetranspose = NULL; 1060 1061 /* get the triangular factors */ 1062 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1063 PetscFunctionReturn(PETSC_SUCCESS); 1064 } 1065 1066 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1067 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1068 { 1069 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1070 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1071 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1072 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1073 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1074 cusparseIndexBase_t indexBase; 1075 cusparseMatrixType_t matrixType; 1076 cusparseFillMode_t fillMode; 1077 cusparseDiagType_t diagType; 1078 1079 PetscFunctionBegin; 1080 /* allocate space for the transpose of the lower triangular factor */ 1081 PetscCall(PetscNew(&loTriFactorT)); 1082 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1083 1084 /* set the matrix descriptors of the lower triangular factor */ 1085 matrixType = cusparseGetMatType(loTriFactor->descr); 1086 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1087 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1088 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1089 1090 /* Create the matrix description */ 1091 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1092 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1093 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1094 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1095 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1096 1097 /* set the operation */ 1098 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1099 1100 /* allocate GPU space for the CSC of the lower triangular factor*/ 1101 loTriFactorT->csrMat = new CsrMatrix; 1102 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1103 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1104 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1105 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1106 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1107 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1108 1109 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1110 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1111 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1112 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1113 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1114 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1115 #endif 1116 1117 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1118 { 1119 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1120 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1121 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1122 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1123 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1124 #else 1125 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1126 #endif 1127 PetscCallCUSPARSE(stat); 1128 } 1129 1130 PetscCallCUDA(WaitForCUDA()); 1131 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1132 1133 /* Create the solve analysis information */ 1134 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1135 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1136 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1137 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1138 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1139 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1140 #endif 1141 1142 /* perform the solve analysis */ 1143 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1144 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1145 1146 PetscCallCUDA(WaitForCUDA()); 1147 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1148 1149 /* assign the pointer */ 1150 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1151 1152 /*********************************************/ 1153 /* Now the Transpose of the Upper Tri Factor */ 1154 /*********************************************/ 1155 1156 /* allocate space for the transpose of the upper triangular factor */ 1157
PetscCall(PetscNew(&upTriFactorT)); 1158 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1159 1160 /* set the matrix descriptors of the upper triangular factor */ 1161 matrixType = cusparseGetMatType(upTriFactor->descr); 1162 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1163 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1164 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1165 1166 /* Create the matrix description */ 1167 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1168 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1169 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1170 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1171 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1172 1173 /* set the operation */ 1174 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1175 1176 /* allocate GPU space for the CSC of the upper triangular factor*/ 1177 upTriFactorT->csrMat = new CsrMatrix; 1178 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1179 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1180 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1181 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1182 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1183 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1184 1185 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1186 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1187 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1188 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1189 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1190 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1191 #endif 1192 1193 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1194 { 1195 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1196 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1197 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1198 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1199 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1200 #else 1201 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1202 #endif 1203 PetscCallCUSPARSE(stat); 1204 } 1205 1206 PetscCallCUDA(WaitForCUDA()); 1207 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1208 1209 /* Create the solve analysis information */ 1210 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1211 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1212 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1213 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1214 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1215 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1216 #endif 1217 1218 /* perform the solve analysis */ 1219 /* TODO: the lower- and upper-factor transpose setup above repeats the same sequence of steps and should be refactored into a shared helper function.
*/ 1220 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1221 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1222 1223 PetscCallCUDA(WaitForCUDA()); 1224 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1225 1226 /* assign the pointer */ 1227 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1228 PetscFunctionReturn(PETSC_SUCCESS); 1229 } 1230 #endif 1231 1232 struct PetscScalarToPetscInt { 1233 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1234 }; 1235 1236 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1237 { 1238 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1239 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1240 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1241 cusparseStatus_t stat; 1242 cusparseIndexBase_t indexBase; 1243 1244 PetscFunctionBegin; 1245 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1246 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1247 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1248 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1249 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1250 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1251 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1252 PetscCall(PetscLogGpuTimeBegin()); 1253 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1254 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1255 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1256 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1257 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1258 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1259 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1260 1261 /* set alpha and beta */ 1262 PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1263 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1264 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 1265 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1266 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1267 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1268 1269 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1270 CsrMatrix *matrixT = new CsrMatrix; 1271 matstructT->mat = matrixT; 1272 matrixT->num_rows = A->cmap->n; 1273 matrixT->num_cols = A->rmap->n; 1274 matrixT->num_entries = a->nz; 1275 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1276 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1277 matrixT->values = new THRUSTARRAY(a->nz); 1278 1279 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1280 
cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1281 1282 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1283 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1284 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1285 indexBase, cusparse_scalartype); 1286 PetscCallCUSPARSE(stat); 1287 #else 1288 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1289 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1290 1291 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1292 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1293 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1294 */ 1295 if (matrixT->num_entries) { 1296 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1297 PetscCallCUSPARSE(stat); 1298 1299 } else { 1300 matstructT->matDescr = NULL; 1301 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1302 } 1303 #endif 1304 #endif 1305 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1306 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1307 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1308 #else 1309 CsrMatrix *temp = new CsrMatrix; 1310 CsrMatrix *tempT = new CsrMatrix; 1311 /* First convert HYB to CSR */ 1312 temp->num_rows = A->rmap->n; 1313 temp->num_cols = A->cmap->n; 1314 temp->num_entries = a->nz; 1315 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1316 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1317 temp->values = new THRUSTARRAY(a->nz); 1318 1319 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1320 PetscCallCUSPARSE(stat); 1321 1322 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1323 tempT->num_rows = A->rmap->n; 1324 tempT->num_cols = A->cmap->n; 1325 tempT->num_entries = a->nz; 1326 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1327 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1328 tempT->values = new THRUSTARRAY(a->nz); 1329 1330 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1331 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1332 PetscCallCUSPARSE(stat); 1333 1334 /* Last, convert CSC to HYB */ 1335 cusparseHybMat_t hybMat; 1336 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1337 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1338 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1339 PetscCallCUSPARSE(stat); 1340 1341 /* assign the pointer */ 1342 matstructT->mat = hybMat; 1343 A->transupdated = PETSC_TRUE; 1344 /* delete temporaries */ 1345 if (tempT) { 1346 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1347 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1348 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1349 delete (CsrMatrix *)tempT; 1350 } 1351 if (temp) { 1352 if (temp->values) delete (THRUSTARRAY *)temp->values; 1353 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1354 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1355 delete (CsrMatrix *)temp; 1356 } 1357 #endif 1358 } 1359 } 1360 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may already be present, update data */ 1361 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1362 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1363 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1364 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1365 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1366 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1367 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1368 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1369 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1370 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1371 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1372 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1373 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1374 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1375 } 1376 if (!cusparsestruct->csr2csc_i) { 1377 THRUSTARRAY csr2csc_a(matrix->num_entries); 1378 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1379 1380 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1381 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1382 void *csr2cscBuffer; 1383 size_t csr2cscBufferSize; 1384 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1385 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 1386 PetscCallCUSPARSE(stat); 1387 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1388 #endif 1389 1390 if (matrix->num_entries) { 1391 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1392 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1393 I checked every parameter and they were all fine. I have no clue why cusparse complains.
1394 1395 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1396 should be filled with indexBase. So I just take a shortcut here. 1397 */ 1398 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1399 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1400 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1401 PetscCallCUSPARSE(stat); 1402 #else 1403 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1404 PetscCallCUSPARSE(stat); 1405 #endif 1406 } else { 1407 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1408 } 1409 1410 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1411 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1412 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1413 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1414 #endif 1415 } 1416 PetscCallThrust( 1417 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1418 } 1419 PetscCall(PetscLogGpuTimeEnd()); 1420 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1421 /* the compressed row indices are not used for matTranspose */ 1422 matstructT->cprowIndices = NULL; 1423 /* assign the pointer */ 1424 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1425 A->transupdated = PETSC_TRUE; 1426 PetscFunctionReturn(PETSC_SUCCESS); 1427 } 1428 1429 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1430 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1431 { 1432 const PetscScalar *barray; 1433 PetscScalar *xarray; 1434 thrust::device_ptr<const PetscScalar> bGPU; 1435 thrust::device_ptr<PetscScalar> xGPU; 1436 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1437 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1438 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1439 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1440 PetscInt m = A->rmap->n; 1441 1442 PetscFunctionBegin; 1443 PetscCall(PetscLogGpuTimeBegin()); 1444 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1445 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1446 xGPU = thrust::device_pointer_cast(xarray); 1447 bGPU = thrust::device_pointer_cast(barray); 1448 1449 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1450 if (fs->rpermIndices) { 1451 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1452 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1453 } else { 1454 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1455 } 1456 1457 // Solve L Y = X 1458 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1459 // Note that cusparseSpSV_solve() secretly uses
the external buffer used in cusparseSpSV_analysis()! 1460 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1461 1462 // Solve U X = Y 1463 if (fs->cpermIndices) { 1464 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1465 } else { 1466 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1467 } 1468 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1469 1470 // Reorder X with the column permutation if needed, and put the result back to x 1471 if (fs->cpermIndices) { 1472 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1473 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1474 } 1475 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1476 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1477 PetscCall(PetscLogGpuTimeEnd()); 1478 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1479 PetscFunctionReturn(PETSC_SUCCESS); 1480 } 1481 1482 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1483 { 1484 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1485 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1486 const PetscScalar *barray; 1487 PetscScalar *xarray; 1488 thrust::device_ptr<const PetscScalar> bGPU; 1489 thrust::device_ptr<PetscScalar> xGPU; 1490 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1491 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1492 PetscInt m = A->rmap->n; 1493 1494 PetscFunctionBegin; 1495 PetscCall(PetscLogGpuTimeBegin()); 1496 if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1497 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1498 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
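The descriptor is simply reused with opA = CUSPARSE_OPERATION_TRANSPOSE, so no transposed copy of the factor is ever materialized.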
We only do transpose solve with it */ 1499 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1500 1501 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1502 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1503 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1504 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1505 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1506 } 1507 1508 if (!fs->updatedTransposeSpSVAnalysis) { 1509 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1510 1511 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1512 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1513 } 1514 1515 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1516 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1517 xGPU = thrust::device_pointer_cast(xarray); 1518 bGPU = thrust::device_pointer_cast(barray); 1519 1520 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1521 if (fs->rpermIndices) { 1522 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1523 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1524 } else { 1525 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1526 } 1527 1528 // Solve Ut Y = X 1529 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1530 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1531 1532 // Solve Lt X = Y 1533 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1534 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1535 } else { 1536 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1537 } 1538 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1539 1540 // Reorder X with the column permutation if needed, and put the result back to x 1541 if (fs->cpermIndices) { 1542 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1543 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1544 } 1545 1546 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1547 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1548 PetscCall(PetscLogGpuTimeEnd()); 1549 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1550 PetscFunctionReturn(PETSC_SUCCESS); 1551 } 1552 #else 1553 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
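Presumably because csrsv2 ties its analysis data to a specific operation (so a separate analysis pass would be needed anyway), and a transpose solve on the original CSR factor is typically much slower than a non-transpose solve on an explicitly transposed factor.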
*/ 1554 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1555 { 1556 PetscInt n = xx->map->n; 1557 const PetscScalar *barray; 1558 PetscScalar *xarray; 1559 thrust::device_ptr<const PetscScalar> bGPU; 1560 thrust::device_ptr<PetscScalar> xGPU; 1561 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1562 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1563 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1564 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1565 1566 PetscFunctionBegin; 1567 /* Analyze the matrix and create the transpose ... on the fly */ 1568 if (!loTriFactorT && !upTriFactorT) { 1569 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1570 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1571 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1572 } 1573 1574 /* Get the GPU pointers */ 1575 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1576 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1577 xGPU = thrust::device_pointer_cast(xarray); 1578 bGPU = thrust::device_pointer_cast(barray); 1579 1580 PetscCall(PetscLogGpuTimeBegin()); 1581 /* First, reorder with the row permutation */ 1582 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1583 1584 /* First, solve U */ 1585 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1586 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1587 1588 /* Then, solve L */ 1589 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1590 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1591 1592 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1593 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1594 1595 /* Copy the temporary to the full solution. 
*/ 1596 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1597 1598 /* restore */ 1599 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1600 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1601 PetscCall(PetscLogGpuTimeEnd()); 1602 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1603 PetscFunctionReturn(PETSC_SUCCESS); 1604 } 1605 1606 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1607 { 1608 const PetscScalar *barray; 1609 PetscScalar *xarray; 1610 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1611 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1612 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1613 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1614 1615 PetscFunctionBegin; 1616 /* Analyze the matrix and create the transpose ... on the fly */ 1617 if (!loTriFactorT && !upTriFactorT) { 1618 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1619 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1620 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1621 } 1622 1623 /* Get the GPU pointers */ 1624 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1625 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1626 1627 PetscCall(PetscLogGpuTimeBegin()); 1628 /* First, solve U */ 1629 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1630 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1631 1632 /* Then, solve L */ 1633 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1634 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1635 1636 /* restore */ 1637 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1638 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1639 PetscCall(PetscLogGpuTimeEnd()); 1640 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1641 PetscFunctionReturn(PETSC_SUCCESS); 1642 } 1643 1644 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1645 { 1646 const PetscScalar *barray; 1647 PetscScalar *xarray; 1648 thrust::device_ptr<const PetscScalar> bGPU; 1649 thrust::device_ptr<PetscScalar> xGPU; 1650 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1651 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1652 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1653 THRUSTARRAY *tempGPU = (THRUSTARRAY 
*)cusparseTriFactors->workVector; 1654 1655 PetscFunctionBegin; 1656 /* Get the GPU pointers */ 1657 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1658 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1659 xGPU = thrust::device_pointer_cast(xarray); 1660 bGPU = thrust::device_pointer_cast(barray); 1661 1662 PetscCall(PetscLogGpuTimeBegin()); 1663 /* First, reorder with the row permutation */ 1664 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1665 1666 /* Next, solve L */ 1667 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1668 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1669 1670 /* Then, solve U */ 1671 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1672 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1673 1674 /* Last, reorder with the column permutation */ 1675 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1676 1677 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1678 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1679 PetscCall(PetscLogGpuTimeEnd()); 1680 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1685 { 1686 const PetscScalar *barray; 1687 PetscScalar *xarray; 1688 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1689 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1690 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1691 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1692 1693 PetscFunctionBegin; 1694 /* Get the GPU pointers */ 1695 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1696 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1697 1698 PetscCall(PetscLogGpuTimeBegin()); 1699 /* First, solve L */ 1700 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1701 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1702 1703 /* Next, solve U */ 1704 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1705 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1706 1707 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1708 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1709 PetscCall(PetscLogGpuTimeEnd()); 1710 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1711 PetscFunctionReturn(PETSC_SUCCESS); 1712 } 1713 #endif 1714 1715 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1716 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1717 { 1718 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1719 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1720 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1721 CsrMatrix *Acsr; 1722 PetscInt m, nz; 1723 PetscBool flg; 1724 1725 PetscFunctionBegin; 1726 if (PetscDefined(USE_DEBUG)) { 1727 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1728 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1729 } 1730 1731 /* Copy A's value to fact */ 1732 m = fact->rmap->n; 1733 nz = aij->nz; 1734 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1735 Acsr = (CsrMatrix *)Acusp->mat->mat; 1736 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1737 1738 PetscCall(PetscLogGpuTimeBegin()); 1739 /* Factorize fact inplace */ 1740 if (m) 1741 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1742 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1743 if (PetscDefined(USE_DEBUG)) { 1744 int numerical_zero; 1745 cusparseStatus_t status; 1746 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1747 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1748 } 1749 1750 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1751 if (fs->updatedSpSVAnalysis) { 1752 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1753 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1754 } else 1755 #endif 1756 { 1757 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1758 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1759 */ 1760 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1761 1762 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1763 1764 fs->updatedSpSVAnalysis = PETSC_TRUE; 1765 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1766 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1767 } 1768 1769 fact->offloadmask = PETSC_OFFLOAD_GPU; 1770 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1771 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1772 fact->ops->matsolve = NULL; 1773 fact->ops->matsolvetranspose = NULL; 1774 PetscCall(PetscLogGpuTimeEnd()); 1775 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1776 PetscFunctionReturn(PETSC_SUCCESS); 1777 } 1778 1779 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1780 { 1781 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1782 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1783 PetscInt m, nz; 1784 1785 PetscFunctionBegin; 1786 if (PetscDefined(USE_DEBUG)) { 1787 PetscInt i; 1788 PetscBool flg, missing; 1789 1790 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1791 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1792 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1793 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1794 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1795 } 1796 1797 /* Free the old stale stuff */ 1798 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1799 1800 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1801 but they will not be used. Allocate them just for easy debugging. 1802 */ 1803 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1804 1805 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1806 fact->factortype = MAT_FACTOR_ILU; 1807 fact->info.factor_mallocs = 0; 1808 fact->info.fill_ratio_given = info->fill; 1809 fact->info.fill_ratio_needed = 1.0; 1810 1811 aij->row = NULL; 1812 aij->col = NULL; 1813 1814 /* ====================================================================== */ 1815 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1816 /* We'll do in-place factorization on fact */ 1817 /* ====================================================================== */ 1818 const int *Ai, *Aj; 1819 1820 m = fact->rmap->n; 1821 nz = aij->nz; 1822 1823 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1824 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1825 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1826 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
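The factorization needs the full row pointer of length m+1; the compressed (cprow) form only covers rows that contain nonzeros.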
The returned Ai, Aj are 32-bit */ 1827 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1828 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1829 1830 /* ====================================================================== */ 1831 /* Create descriptors for M, L, U */ 1832 /* ====================================================================== */ 1833 cusparseFillMode_t fillMode; 1834 cusparseDiagType_t diagType; 1835 1836 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1837 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1838 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1839 1840 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1841 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1842 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1843 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1844 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1845 */ 1846 fillMode = CUSPARSE_FILL_MODE_LOWER; 1847 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1848 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1849 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1850 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1851 1852 fillMode = CUSPARSE_FILL_MODE_UPPER; 1853 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1854 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1855 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1856 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1857 1858 /* ========================================================================= */ 1859 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1860 /* ========================================================================= */ 1861 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1862 if (m) 1863 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1864 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1865 1866 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1867 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1868 1869 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1870 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1871 1872 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1873 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1874 1875 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1876 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1877 1878 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1879 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1880 spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U. 1881 To save memory, we make factBuffer_M share storage with the bigger of spsvBuffer_L/U. 1882 */ 1883 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1884 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1885 fs->spsvBuffer_L = fs->factBuffer_M; 1886 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1887 } else { 1888 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1889 fs->spsvBuffer_U = fs->factBuffer_M; 1890 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1891 } 1892 1893 /* ========================================================================== */ 1894 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1895 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U) */ 1896 /* ========================================================================== */ 1897 int structural_zero; 1898 cusparseStatus_t status; 1899 1900 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1901 if (m) 1902 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1903 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1904 if (PetscDefined(USE_DEBUG)) { 1905 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1906 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1907 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1908 } 1909 1910 /* Estimate FLOPs of the numeric factorization */ 1911 { 1912 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1913 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1914 PetscLogDouble flops = 0.0; 1915 1916 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1917 Ai = Aseq->i; 1918 Adiag = Aseq->diag; 1919 for (PetscInt i = 0; i < m; i++) { 1920 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros to the left of the diagonal in row i */ 1921 nzRow = Ai[i + 1] - Ai[i]; 1922 nzLeft = Adiag[i] - Ai[i]; 1923 /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the entries to the right of, 1924 and including, the eliminated one, each update incurring a multiplication and an addition.
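Summing 2*(nzRow - k + 1) over k = 1..nzLeft gives nzLeft*(2*nzRow - nzLeft + 1) flops for the row. Note that the count below approximates nzLeft by (nzRow - 1)/2, i.e., it assumes the diagonal sits roughly in the middle of the row, rather than using the exact nzLeft computed above.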
1925 */ 1926 nzLeft = (nzRow - 1) / 2; 1927 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1928 } 1929 } 1930 fs->numericFactFlops = flops; 1931 } 1932 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1937 { 1938 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1939 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1940 const PetscScalar *barray; 1941 PetscScalar *xarray; 1942 1943 PetscFunctionBegin; 1944 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1945 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1946 PetscCall(PetscLogGpuTimeBegin()); 1947 1948 /* Solve L*y = b */ 1949 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1950 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1951 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1952 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1953 1954 /* Solve Lt*x = y */ 1955 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1956 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1957 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1958 1959 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1960 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1961 1962 PetscCall(PetscLogGpuTimeEnd()); 1963 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1964 PetscFunctionReturn(PETSC_SUCCESS); 1965 } 1966 1967 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1968 { 1969 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1970 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1971 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1972 CsrMatrix *Acsr; 1973 PetscInt m, nz; 1974 PetscBool flg; 1975 1976 PetscFunctionBegin; 1977 if (PetscDefined(USE_DEBUG)) { 1978 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1979 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1980 } 1981 1982 /* Copy A's value to fact */ 1983 m = fact->rmap->n; 1984 nz = aij->nz; 1985 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1986 Acsr = (CsrMatrix *)Acusp->mat->mat; 1987 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1988 1989 /* Factorize fact inplace */ 1990 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1991 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1992 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1993 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1994 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 
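Hence we can pass the full CSR arrays of fact below; the strictly upper triangular entries are simply never read.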
1995 */ 1996 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1997 if (PetscDefined(USE_DEBUG)) { 1998 int numerical_zero; 1999 cusparseStatus_t status; 2000 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 2001 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 2002 } 2003 2004 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 2005 if (fs->updatedSpSVAnalysis) { 2006 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2007 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2008 } else 2009 #endif 2010 { 2011 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 2012 2013 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2014 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2015 */ 2016 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 2017 fs->updatedSpSVAnalysis = PETSC_TRUE; 2018 } 2019 2020 fact->offloadmask = PETSC_OFFLOAD_GPU; 2021 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2022 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2023 fact->ops->matsolve = NULL; 2024 fact->ops->matsolvetranspose = NULL; 2025 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2026 PetscFunctionReturn(PETSC_SUCCESS); 2027 } 2028 2029 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2030 { 2031 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2032 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2033 PetscInt m, nz; 2034 2035 PetscFunctionBegin; 2036 if (PetscDefined(USE_DEBUG)) { 2037 PetscInt i; 2038 PetscBool flg, missing; 2039 2040 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2041 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2042 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2043 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2044 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2045 } 2046 2047 /* Free the old stale stuff */ 2048 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2049 2050 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2051 but they will not be used. Allocate them just for easy debugging. 
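The numeric factorization itself operates entirely on the device-side row/column/value arrays allocated below.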
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host;
     they will not be used, but they make debugging easier.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
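  /* For orientation, the 32-bit CSR layout just copied (an illustrative sketch, not code that
     runs): for a 3x3 tridiagonal matrix,
       csrRowPtr32 = {0, 2, 5, 7}       (m + 1 entries; row i occupies [csrRowPtr32[i], csrRowPtr32[i+1]))
       csrColIdx32 = {0,1, 0,1,2, 1,2}  (nz = 7 column indices)
       csrVal      = the 7 scalar values in the same order */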
  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we let the factorization buffer share space with the larger of the
     two solver buffers. See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call.
       It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We eliminate the nonzeros to the left of the diagonal one by one. Each elimination
           updates the nonzeros to the right of, and including, the eliminated one, which incurs
           a multiplication and an addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
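  /* A worked instance of the estimate above (illustration only, not used by the code): a row
     with nzRow = 5 stored entries gives nzLeft = (5 - 1) / 2 = 2 eliminations, contributing
     2 * (2.0 * 5 - 2 + 1) = 18 flops to the running total. */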
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));

  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
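/* A usage sketch for selecting this solver from user code (the surrounding KSP/PC objects and
   setup are assumed, not shown; the calls named are the public PETSc API referenced in the
   manual page above):

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));

   or, equivalently, -pc_type ilu -pc_factor_mat_solver_type cusparse on the command line. */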
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));

  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
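/* A descriptive note on the two paths inside the routine below: when the nonzero pattern is
   unchanged (A->nonzerostate matches the cached nonzerostate) and the format is CSR, only the
   values are re-uploaded; otherwise the device matrix is destroyed and rebuilt from the host
   CSR data. */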
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build the device matrix in the chosen storage format: CSR, or hybrid/ELLPACK */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
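/* MatMatCusparse carries the state cached between the symbolic and numeric phases of the
   mat-mat products below: intermediate dense storage (Bt, X), cuSPARSE descriptors, and
   device work buffers that must stay alive across calls (see the notes on dBuffer4/dBuffer5
   and mmBuffer2 further down). */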
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar   *Bt;
  Mat            X;
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
     Instead of silently accepting a wrong answer, we prefer to raise an error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found that matADescr could not be reused; it may be a cuSPARSE bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
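  /* Flop model used for logging (an estimate): every stored entry of A is multiplied and
     accumulated once per column of op(B), i.e. 2 * nnz(A) * n flops. */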
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
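/* Relationship between the symbolic and numeric phases (a descriptive note): the symbolic
   routine below computes the sparsity pattern of C = op(A) op(B) and, on the cusparseSpGEMMreuse
   path (CUDA >= 11.4), already fills in the values; when mmdata->reusesym is set, the first
   call to MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE() above therefore skips the
   recomputation. */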
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format as well */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
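    /* For orientation, the reuse workflow below has four device-side stages (a descriptive
       summary of the calls that follow; buffer numbering matches the cuSPARSE sample above):
         1) workEstimation -> dBuffer1 (freed after the nnz stage)
         2) nnz            -> dBuffer2, dBuffer3, dBuffer4 (dBuffer2 freed right after; dBuffer3 after copy)
         3) copy           -> dBuffer5
         4) compute        -> fills the values of C
       dBuffer4 and dBuffer5 are kept in mmdata because cusparseSpGEMMreuse_compute() needs
       them again on later numeric calls. */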
* (b->i[brow + 1] - b->i[brow]); 3223 } 3224 } 3225 } else if (ptype == MATPRODUCT_AtB) { 3226 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3227 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3228 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3229 flops += (2. * anzi) * bnzi; 3230 } 3231 } else { /* TODO */ 3232 flops = 0.; 3233 } 3234 3235 mmdata->flops = flops; 3236 PetscCall(PetscLogGpuTimeBegin()); 3237 3238 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3239 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3240 // cuda-12.2 requires non-null csrRowOffsets 3241 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3242 PetscCallCUSPARSE(stat); 3243 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3244 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3245 { 3246 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3247 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3248 */ 3249 void *dBuffer1 = NULL; 3250 void *dBuffer2 = NULL; 3251 void *dBuffer3 = NULL; 3252 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3253 size_t bufferSize1 = 0; 3254 size_t bufferSize2 = 0; 3255 size_t bufferSize3 = 0; 3256 size_t bufferSize4 = 0; 3257 size_t bufferSize5 = 0; 3258 3259 /* ask bufferSize1 bytes for external memory */ 3260 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3261 PetscCallCUSPARSE(stat); 3262 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3263 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3264 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3265 PetscCallCUSPARSE(stat); 3266 3267 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3268 PetscCallCUSPARSE(stat); 3269 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3270 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3271 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3272 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3273 PetscCallCUSPARSE(stat); 3274 PetscCallCUDA(cudaFree(dBuffer1)); 3275 PetscCallCUDA(cudaFree(dBuffer2)); 3276 3277 /* get matrix C non-zero entries C_nnz1 */ 3278 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3279 c->nz = (PetscInt)C_nnz1; 3280 /* allocate matrix C */ 3281 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3282 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3283 Ccsr->values = new THRUSTARRAY(c->nz); 3284 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3285 /* update matC with the new pointers */ 3286 stat = cusparseCsrSetPointers(Cmat->matDescr, 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3287 PetscCallCUSPARSE(stat); 3288 3289 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3290 PetscCallCUSPARSE(stat); 3291 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3292 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3293 PetscCallCUSPARSE(stat); 3294 PetscCallCUDA(cudaFree(dBuffer3)); 3295 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3296 PetscCallCUSPARSE(stat); 3297 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3298 } 3299 #else 3300 size_t bufSize2; 3301 /* ask bufferSize bytes for external memory */ 3302 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 3303 PetscCallCUSPARSE(stat); 3304 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3305 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3306 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3307 PetscCallCUSPARSE(stat); 3308 /* ask again for the needed buffer size in bytes for external memory */ 3309 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3310 PetscCallCUSPARSE(stat); 3311 /* Neither the CUSPARSE documentation nor the API is clear: 3312 we need both buffers to perform the operations properly! 3313 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 3314 it only appears in the workEstimation calls, yet it seems to be needed in compute, so probably its address 3315 is stored in the descriptor! What a messy API...
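To be safe, we therefore keep both mmBuffer2 and mmBuffer alive in mmdata for as long as spgemmDesc is in use.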
*/ 3316 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3317 /* compute the intermediate product of A * B */ 3318 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3319 PetscCallCUSPARSE(stat); 3320 /* get matrix C non-zero entries C_nnz1 */ 3321 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3322 c->nz = (PetscInt)C_nnz1; 3323 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3324 mmdata->mmBufferSize / 1024)); 3325 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3326 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3327 Ccsr->values = new THRUSTARRAY(c->nz); 3328 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3329 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3330 PetscCallCUSPARSE(stat); 3331 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3332 PetscCallCUSPARSE(stat); 3333 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3334 #else 3335 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3336 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3337 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3338 PetscCallCUSPARSE(stat); 3339 c->nz = cnz; 3340 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3341 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3342 Ccsr->values = new THRUSTARRAY(c->nz); 3343 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3344 3345 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3346 /* with the old gemm interface (removed from CUDA 11.0 on) we cannot compute only the symbolic factorization. 3347 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 3348 D is NULL, despite the fact that the CUSPARSE documentation claims it is supported!
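Hence we simply run the full numeric spgemm below during the symbolic phase; the values it computes can then be reused by the numeric phase (see the reusesym logic at the end of this routine).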
*/ 3349 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3350 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3351 PetscCallCUSPARSE(stat); 3352 #endif 3353 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3354 PetscCall(PetscLogGpuTimeEnd()); 3355 finalizesym: 3356 c->free_a = PETSC_TRUE; 3357 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3358 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3359 c->free_ij = PETSC_TRUE; 3360 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3361 PetscInt *d_i = c->i; 3362 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3363 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3364 ii = *Ccsr->row_offsets; 3365 jj = *Ccsr->column_indices; 3366 if (ciscompressed) d_i = c->compressedrow.i; 3367 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3368 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3369 } else { 3370 PetscInt *d_i = c->i; 3371 if (ciscompressed) d_i = c->compressedrow.i; 3372 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3373 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3374 } 3375 if (ciscompressed) { /* need to expand host row offsets */ 3376 PetscInt r = 0; 3377 c->i[0] = 0; 3378 for (k = 0; k < c->compressedrow.nrows; k++) { 3379 const PetscInt next = c->compressedrow.rindex[k]; 3380 const PetscInt old = c->compressedrow.i[k]; 3381 for (; r < next; r++) c->i[r + 1] = old; 3382 } 3383 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3384 } 3385 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3386 PetscCall(PetscMalloc1(m, &c->ilen)); 3387 PetscCall(PetscMalloc1(m, &c->imax)); 3388 c->maxnz = c->nz; 3389 c->nonzerorowcnt = 0; 3390 c->rmax = 0; 3391 for (k = 0; k < m; k++) { 3392 const PetscInt nn = c->i[k + 1] - c->i[k]; 3393 c->ilen[k] = c->imax[k] = nn; 3394 c->nonzerorowcnt += (PetscInt)!!nn; 3395 c->rmax = PetscMax(c->rmax, nn); 3396 } 3397 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3398 PetscCall(PetscMalloc1(c->nz, &c->a)); 3399 Ccsr->num_entries = c->nz; 3400 3401 C->nonzerostate++; 3402 PetscCall(PetscLayoutSetUp(C->rmap)); 3403 PetscCall(PetscLayoutSetUp(C->cmap)); 3404 Ccusp->nonzerostate = C->nonzerostate; 3405 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3406 C->preallocated = PETSC_TRUE; 3407 C->assembled = PETSC_FALSE; 3408 C->was_assembled = PETSC_FALSE; 3409 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3410 mmdata->reusesym = PETSC_TRUE; 3411 C->offloadmask = PETSC_OFFLOAD_GPU; 3412 } 3413 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3414 
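  /* For reference, a hedged sketch (public MatProduct API only, not code from this file) of the user-level call
     sequence that reaches this symbolic routine for two MATSEQAIJCUSPARSE matrices A and B:

       Mat C;
       PetscCall(MatProductCreate(A, B, NULL, &C));
       PetscCall(MatProductSetType(C, MATPRODUCT_AB));
       PetscCall(MatProductSetFromOptions(C)); // selects this CUSPARSE backend
       PetscCall(MatProductSymbolic(C));       // dispatches to this routine
       PetscCall(MatProductNumeric(C));        // dispatches to MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE set above
  */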
PetscFunctionReturn(PETSC_SUCCESS); 3415 } 3416 3417 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3418 3419 /* handles sparse or dense B */ 3420 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3421 { 3422 Mat_Product *product = mat->product; 3423 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3424 3425 PetscFunctionBegin; 3426 MatCheckProduct(mat, 1); 3427 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3428 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3429 if (product->type == MATPRODUCT_ABC) { 3430 Ciscusp = PETSC_FALSE; 3431 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3432 } 3433 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3434 PetscBool usecpu = PETSC_FALSE; 3435 switch (product->type) { 3436 case MATPRODUCT_AB: 3437 if (product->api_user) { 3438 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3439 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3440 PetscOptionsEnd(); 3441 } else { 3442 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3443 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3444 PetscOptionsEnd(); 3445 } 3446 break; 3447 case MATPRODUCT_AtB: 3448 if (product->api_user) { 3449 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3450 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3451 PetscOptionsEnd(); 3452 } else { 3453 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3454 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3455 PetscOptionsEnd(); 3456 } 3457 break; 3458 case MATPRODUCT_PtAP: 3459 if (product->api_user) { 3460 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3461 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3462 PetscOptionsEnd(); 3463 } else { 3464 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3465 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3466 PetscOptionsEnd(); 3467 } 3468 break; 3469 case MATPRODUCT_RARt: 3470 if (product->api_user) { 3471 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3472 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3473 PetscOptionsEnd(); 3474 } else { 3475 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3476 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3477 PetscOptionsEnd(); 3478 } 3479 break; 3480 case MATPRODUCT_ABC: 3481 if (product->api_user) { 3482 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), 
((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3483 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3484 PetscOptionsEnd(); 3485 } else { 3486 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3487 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3488 PetscOptionsEnd(); 3489 } 3490 break; 3491 default: 3492 break; 3493 } 3494 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3495 } 3496 /* dispatch */ 3497 if (isdense) { 3498 switch (product->type) { 3499 case MATPRODUCT_AB: 3500 case MATPRODUCT_AtB: 3501 case MATPRODUCT_ABt: 3502 case MATPRODUCT_PtAP: 3503 case MATPRODUCT_RARt: 3504 if (product->A->boundtocpu) { 3505 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3506 } else { 3507 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3508 } 3509 break; 3510 case MATPRODUCT_ABC: 3511 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3512 break; 3513 default: 3514 break; 3515 } 3516 } else if (Biscusp && Ciscusp) { 3517 switch (product->type) { 3518 case MATPRODUCT_AB: 3519 case MATPRODUCT_AtB: 3520 case MATPRODUCT_ABt: 3521 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3522 break; 3523 case MATPRODUCT_PtAP: 3524 case MATPRODUCT_RARt: 3525 case MATPRODUCT_ABC: 3526 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3527 break; 3528 default: 3529 break; 3530 } 3531 } else { /* fallback for AIJ */ 3532 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3533 } 3534 PetscFunctionReturn(PETSC_SUCCESS); 3535 } 3536 3537 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3538 { 3539 PetscFunctionBegin; 3540 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3541 PetscFunctionReturn(PETSC_SUCCESS); 3542 } 3543 3544 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3545 { 3546 PetscFunctionBegin; 3547 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3548 PetscFunctionReturn(PETSC_SUCCESS); 3549 } 3550 3551 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3552 { 3553 PetscFunctionBegin; 3554 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3555 PetscFunctionReturn(PETSC_SUCCESS); 3556 } 3557 3558 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3559 { 3560 PetscFunctionBegin; 3561 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3562 PetscFunctionReturn(PETSC_SUCCESS); 3563 } 3564 3565 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3566 { 3567 PetscFunctionBegin; 3568 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3569 PetscFunctionReturn(PETSC_SUCCESS); 3570 } 3571 3572 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3573 { 3574 int i = blockIdx.x * blockDim.x + threadIdx.x; 3575 if (i < n) y[idx[i]] += x[i]; 3576 } 3577 3578 /* z = op(A) x + y. 
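yy may be NULL, in which case zz = op(A) xx; yy and zz may also alias the same vector. Both cases are handled below.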
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3579 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3580 { 3581 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3582 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3583 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3584 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3585 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3586 PetscBool compressed; 3587 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3588 PetscInt nx, ny; 3589 #endif 3590 3591 PetscFunctionBegin; 3592 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported"); 3593 if (!a->nz) { 3594 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3595 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3596 PetscFunctionReturn(PETSC_SUCCESS); 3597 } 3598 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3599 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3600 if (!trans) { 3601 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3602 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3603 } else { 3604 if (herm || !A->form_explicit_transpose) { 3605 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3606 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3607 } else { 3608 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3609 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3610 } 3611 } 3612 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3613 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3614 3615 try { 3616 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3617 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3618 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3619 3620 PetscCall(PetscLogGpuTimeBegin()); 3621 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3622 /* z = A x + beta y. 3623 If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3624 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3625 */ 3626 xptr = xarray; 3627 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3628 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3629 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3630 /* Get the lengths of x and y for y = Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3631 allocated to accommodate different uses. So we get the length info directly from mat. 3632 */ 3633 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3634 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3635 nx = mat->num_cols; // since y = Ax 3636 ny = mat->num_rows; 3637 } 3638 #endif 3639 } else { 3640 /* z = A^T x + beta y. 3641 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3642 Note A^T x is of full length, so we set beta to 1.0 if y exists. 3643 */ 3644 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3645 dptr = zarray; 3646 beta = yy ?
matstruct->beta_one : matstruct->beta_zero; 3647 if (compressed) { /* Scatter x to work vector */ 3648 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3649 3650 thrust::for_each( 3651 #if PetscDefined(HAVE_THRUST_ASYNC) 3652 thrust::cuda::par.on(PetscDefaultCudaStream), 3653 #endif 3654 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3655 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3656 } 3657 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3658 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3659 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3660 nx = mat->num_rows; // since y = A^T x 3661 ny = mat->num_cols; 3662 } 3663 #endif 3664 } 3665 3666 /* csr_spmv does y = alpha op(A) x + beta y */ 3667 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3668 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3669 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3670 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 3671 #else 3672 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3673 #endif 3674 3675 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3676 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3677 if (!matDescr) { 3678 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3679 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3680 } 3681 #endif 3682 3683 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3684 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3685 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3686 PetscCallCUSPARSE( 3687 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3688 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3689 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3690 PetscCallCUSPARSE( 3691 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3692 #endif 3693 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3694 } else { 3695 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3696 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3697 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3698 } 3699 3700 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, 
matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3701 #else 3702 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3703 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3704 #endif 3705 } else { 3706 if (cusparsestruct->nrows) { 3707 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3708 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3709 #else 3710 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3711 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3712 #endif 3713 } 3714 } 3715 PetscCall(PetscLogGpuTimeEnd()); 3716 3717 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3718 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3719 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3720 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3721 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3722 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3723 } 3724 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3725 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3726 } 3727 3728 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3729 if (compressed) { 3730 PetscCall(PetscLogGpuTimeBegin()); 3731 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3732 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3733 PetscCall(PetscLogGpuTimeEnd()); 3734 } 3735 } else { 3736 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3737 } 3738 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3739 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3740 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3741 } catch (char *ex) { 3742 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3743 } 3744 if (yy) { 3745 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3746 } else { 3747 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3748 } 3749 PetscFunctionReturn(PETSC_SUCCESS); 3750 } 3751 3752 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3753 { 3754 PetscFunctionBegin; 3755 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3756 PetscFunctionReturn(PETSC_SUCCESS); 3757 } 3758 3759 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3760 { 3761 PetscFunctionBegin; 3762 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3763 PetscFunctionReturn(PETSC_SUCCESS); 3764 } 3765 3766 /*@ 3767 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 3768 (the default parallel PETSc format). 3769 3770 Collective 3771 3772 Input Parameters: 3773 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3774 . m - number of rows 3775 . n - number of columns 3776 . 
nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3777 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3778 3779 Output Parameter: 3780 . A - the matrix 3781 3782 Level: intermediate 3783 3784 Notes: 3785 This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for 3786 calculations. For good matrix assembly performance the user should preallocate the matrix 3787 storage by setting the parameter `nz` (or the array `nnz`). 3788 3789 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3790 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3791 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3792 3793 The AIJ format, also called 3794 compressed row storage, is fully compatible with standard Fortran 3795 storage. That is, the stored row and column indices can begin at 3796 either one (as in Fortran) or zero. 3797 3798 Specify the preallocated storage with either nz or nnz (not both). 3799 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3800 allocation. 3801 3802 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE` 3803 @*/ 3804 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3805 { 3806 PetscFunctionBegin; 3807 PetscCall(MatCreate(comm, A)); 3808 PetscCall(MatSetSizes(*A, m, n, m, n)); 3809 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3810 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3811 PetscFunctionReturn(PETSC_SUCCESS); 3812 } 3813 3814 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3815 { 3816 PetscFunctionBegin; 3817 if (A->factortype == MAT_FACTOR_NONE) { 3818 PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 3819 } else { 3820 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3821 } 3822 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3823 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3824 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3825 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3826 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3827 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3828 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3829 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3830 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3831 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3832 PetscCall(MatDestroy_SeqAIJ(A)); 3833 PetscFunctionReturn(PETSC_SUCCESS); 3834 } 3835 3836 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3837 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3838 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3839 { 3840 PetscFunctionBegin;
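  /* Duplicate on the host in MATSEQAIJ format first, then convert the duplicate in place to MATSEQAIJCUSPARSE; the GPU copy of the new matrix is (re)created on first GPU use */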
3841 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3842 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3843 PetscFunctionReturn(PETSC_SUCCESS); 3844 } 3845 3846 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3847 { 3848 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3849 Mat_SeqAIJCUSPARSE *cy; 3850 Mat_SeqAIJCUSPARSE *cx; 3851 PetscScalar *ay; 3852 const PetscScalar *ax; 3853 CsrMatrix *csry, *csrx; 3854 3855 PetscFunctionBegin; 3856 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3857 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3858 if (X->ops->axpy != Y->ops->axpy) { 3859 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3860 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3861 PetscFunctionReturn(PETSC_SUCCESS); 3862 } 3863 /* if we are here, it means both matrices are bound to GPU */ 3864 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3865 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3866 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3867 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3868 csry = (CsrMatrix *)cy->mat->mat; 3869 csrx = (CsrMatrix *)cx->mat->mat; 3870 /* see if we can turn this into a cublas axpy */ 3871 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3872 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3873 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3874 if (eq) str = SAME_NONZERO_PATTERN; 3875 } 3876 /* spgeam is buggy with one column */ 3877 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3878 3879 if (str == SUBSET_NONZERO_PATTERN) { 3880 PetscScalar b = 1.0; 3881 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3882 size_t bufferSize; 3883 void *buffer; 3884 #endif 3885 3886 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3887 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3888 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3889 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3890 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3891 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3892 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3893 PetscCall(PetscLogGpuTimeBegin()); 3894 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3895 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3896 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3897 PetscCall(PetscLogGpuTimeEnd()); 3898 PetscCallCUDA(cudaFree(buffer)); 3899 #else 3900 PetscCall(PetscLogGpuTimeBegin()); 3901 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, 
csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3902 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3903 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3904 PetscCall(PetscLogGpuTimeEnd()); 3905 #endif 3906 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3907 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3908 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3909 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3910 } else if (str == SAME_NONZERO_PATTERN) { 3911 cublasHandle_t cublasv2handle; 3912 PetscBLASInt one = 1, bnz = 1; 3913 3914 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3915 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3916 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3917 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3918 PetscCall(PetscLogGpuTimeBegin()); 3919 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3920 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3921 PetscCall(PetscLogGpuTimeEnd()); 3922 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3923 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3924 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3925 } else { 3926 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3927 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3928 } 3929 PetscFunctionReturn(PETSC_SUCCESS); 3930 } 3931 3932 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3933 { 3934 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3935 PetscScalar *ay; 3936 cublasHandle_t cublasv2handle; 3937 PetscBLASInt one = 1, bnz = 1; 3938 3939 PetscFunctionBegin; 3940 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3941 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3942 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3943 PetscCall(PetscLogGpuTimeBegin()); 3944 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3945 PetscCall(PetscLogGpuFlops(bnz)); 3946 PetscCall(PetscLogGpuTimeEnd()); 3947 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3948 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3949 PetscFunctionReturn(PETSC_SUCCESS); 3950 } 3951 3952 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3953 { 3954 PetscBool both = PETSC_FALSE; 3955 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3956 3957 PetscFunctionBegin; 3958 if (A->factortype == MAT_FACTOR_NONE) { 3959 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3960 if (spptr->mat) { 3961 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3962 if (matrix->values) { 3963 both = PETSC_TRUE; 3964 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3965 } 3966 } 3967 if (spptr->matTranspose) { 3968 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3969 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3970 } 3971 } 3972 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3973 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3974 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3975 else A->offloadmask = PETSC_OFFLOAD_CPU; 3976 PetscFunctionReturn(PETSC_SUCCESS); 3977 } 3978 3979 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3980 { 3981 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3982 3983 PetscFunctionBegin; 3984 if (A->factortype != MAT_FACTOR_NONE) { 3985 A->boundtocpu = flg; 3986 PetscFunctionReturn(PETSC_SUCCESS); 3987 } 3988 if (flg) { 3989 
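    /* binding to the CPU: pull any up-to-date values back from the GPU, then restore the default MATSEQAIJ operations so subsequent operations run on the host */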
PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3990 3991 A->ops->scale = MatScale_SeqAIJ; 3992 A->ops->axpy = MatAXPY_SeqAIJ; 3993 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3994 A->ops->mult = MatMult_SeqAIJ; 3995 A->ops->multadd = MatMultAdd_SeqAIJ; 3996 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3997 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3998 A->ops->multhermitiantranspose = NULL; 3999 A->ops->multhermitiantransposeadd = NULL; 4000 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 4001 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 4002 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 4003 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 4004 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 4005 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 4006 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 4007 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 4008 } else { 4009 A->ops->scale = MatScale_SeqAIJCUSPARSE; 4010 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4011 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4012 A->ops->mult = MatMult_SeqAIJCUSPARSE; 4013 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4014 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4015 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4016 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4017 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4018 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4019 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4020 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4021 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4022 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4023 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4024 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4025 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4026 4027 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4028 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4029 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4030 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4031 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4032 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4033 } 4034 A->boundtocpu = flg; 4035 if (flg && a->inode.size_csr) { 4036 a->inode.use = PETSC_TRUE; 4037 } else { 4038 a->inode.use = PETSC_FALSE; 4039 } 4040 PetscFunctionReturn(PETSC_SUCCESS); 4041 } 4042 4043 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4044 { 4045 Mat B; 4046 4047 
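  /* convert a host MATSEQAIJ matrix to MATSEQAIJCUSPARSE: create the cuSPARSE context if needed, make VECCUDA the default vector type, and install the GPU method table (via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE) below) */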
PetscFunctionBegin; 4048 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4049 if (reuse == MAT_INITIAL_MATRIX) { 4050 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4051 } else if (reuse == MAT_REUSE_MATRIX) { 4052 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4053 } 4054 B = *newmat; 4055 4056 PetscCall(PetscFree(B->defaultvectype)); 4057 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4058 4059 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4060 if (B->factortype == MAT_FACTOR_NONE) { 4061 Mat_SeqAIJCUSPARSE *spptr; 4062 PetscCall(PetscNew(&spptr)); 4063 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4064 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4065 spptr->format = MAT_CUSPARSE_CSR; 4066 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4067 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4068 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4069 #else 4070 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4071 #endif 4072 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4073 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4074 #endif 4075 B->spptr = spptr; 4076 } else { 4077 Mat_SeqAIJCUSPARSETriFactors *spptr; 4078 4079 PetscCall(PetscNew(&spptr)); 4080 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4081 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4082 B->spptr = spptr; 4083 } 4084 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4085 } 4086 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4087 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4088 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4089 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4090 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4091 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4092 4093 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 4094 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 4095 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4096 #if defined(PETSC_HAVE_HYPRE) 4097 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4098 #endif 4099 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4100 PetscFunctionReturn(PETSC_SUCCESS); 4101 } 4102 4103 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4104 { 4105 PetscFunctionBegin; 4106 PetscCall(MatCreate_SeqAIJ(B)); 4107 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 4108 PetscFunctionReturn(PETSC_SUCCESS); 4109 } 4110 4111 /*MC 4112 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4113 4114 A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 4115 CSR, ELL, or Hybrid format. 4116 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 4117 4118 Options Database Keys: 4119 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 4120 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 4121 Other options include ell (ellpack) or hyb (hybrid). 4122 . 
-mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 4123 - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4124 4125 Level: beginner 4126 4127 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4128 M*/ 4129 4130 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4131 { 4132 PetscFunctionBegin; 4133 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 4134 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 4135 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 4136 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4137 PetscFunctionReturn(PETSC_SUCCESS); 4138 } 4139 4140 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4141 { 4142 Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4143 4144 PetscFunctionBegin; 4145 if (cusp) { 4146 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 4147 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4148 delete cusp->workVector; 4149 delete cusp->rowoffsets_gpu; 4150 delete cusp->csr2csc_i; 4151 delete cusp->coords; 4152 if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 4153 PetscCall(PetscFree(mat->spptr)); 4154 } 4155 PetscFunctionReturn(PETSC_SUCCESS); 4156 } 4157 4158 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4159 { 4160 PetscFunctionBegin; 4161 if (*mat) { 4162 delete (*mat)->values; 4163 delete (*mat)->column_indices; 4164 delete (*mat)->row_offsets; 4165 delete *mat; 4166 *mat = 0; 4167 } 4168 PetscFunctionReturn(PETSC_SUCCESS); 4169 } 4170 4171 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4172 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4173 { 4174 PetscFunctionBegin; 4175 if (*trifactor) { 4176 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4177 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4178 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4179 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4180 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4181 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4182 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4183 #endif 4184 PetscCall(PetscFree(*trifactor)); 4185 } 4186 PetscFunctionReturn(PETSC_SUCCESS); 4187 } 4188 #endif 4189 4190 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4191 { 4192 CsrMatrix *mat; 4193 4194 PetscFunctionBegin; 4195 if (*matstruct) { 4196 if ((*matstruct)->mat) { 4197 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4198 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4199 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4200 #else 4201 cusparseHybMat_t hybMat = 
(cusparseHybMat_t)(*matstruct)->mat; 4202 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4203 #endif 4204 } else { 4205 mat = (CsrMatrix *)(*matstruct)->mat; 4206 PetscCall(CsrMatrix_Destroy(&mat)); 4207 } 4208 } 4209 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4210 delete (*matstruct)->cprowIndices; 4211 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4212 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4213 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4214 4215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4216 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4217 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4218 4219 for (int i = 0; i < 3; i++) { 4220 if (mdata->cuSpMV[i].initialized) { 4221 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4222 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4223 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4224 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4225 if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4226 if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4227 #endif 4228 } 4229 } 4230 #endif 4231 delete *matstruct; 4232 *matstruct = NULL; 4233 } 4234 PetscFunctionReturn(PETSC_SUCCESS); 4235 } 4236 4237 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4238 { 4239 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4240 4241 PetscFunctionBegin; 4242 if (fs) { 4243 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4244 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4245 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4246 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4247 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4248 delete fs->workVector; 4249 fs->workVector = NULL; 4250 #endif 4251 delete fs->rpermIndices; 4252 delete fs->cpermIndices; 4253 fs->rpermIndices = NULL; 4254 fs->cpermIndices = NULL; 4255 fs->init_dev_prop = PETSC_FALSE; 4256 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4257 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4258 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4259 PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 4260 PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4261 PetscCallCUDA(cudaFree(fs->csrVal)); 4262 PetscCallCUDA(cudaFree(fs->diag)); 4263 PetscCallCUDA(cudaFree(fs->X)); 4264 PetscCallCUDA(cudaFree(fs->Y)); 4265 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed, since factBuffer_M shares memory with one of spsvBuffer_L/U */ 4266 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4267 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4268 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4269 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4270 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4271 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4272 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4273 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4274 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4275 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4276 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4277 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4278 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
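    /* info structures created for the csrilu02/csric02 numeric factorizations */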
4279 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4280 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4281 PetscCall(PetscFree(fs->csrRowPtr_h)); 4282 PetscCall(PetscFree(fs->csrVal_h)); 4283 PetscCall(PetscFree(fs->diag_h)); 4284 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4285 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4286 #endif 4287 } 4288 PetscFunctionReturn(PETSC_SUCCESS); 4289 } 4290 4291 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4292 { 4293 PetscFunctionBegin; 4294 if (*trifactors) { 4295 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4296 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 4297 PetscCall(PetscFree(*trifactors)); 4298 } 4299 PetscFunctionReturn(PETSC_SUCCESS); 4300 } 4301 4302 struct IJCompare { 4303 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4304 { 4305 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4306 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4307 return false; 4308 } 4309 }; 4310 4311 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4312 { 4313 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4314 4315 PetscFunctionBegin; 4316 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4317 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4318 if (destroy) { 4319 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4320 delete cusp->csr2csc_i; 4321 cusp->csr2csc_i = NULL; 4322 } 4323 A->transupdated = PETSC_FALSE; 4324 PetscFunctionReturn(PETSC_SUCCESS); 4325 } 4326 4327 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4328 { 4329 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 4330 4331 PetscFunctionBegin; 4332 PetscCallCUDA(cudaFree(coo->perm)); 4333 PetscCallCUDA(cudaFree(coo->jmap)); 4334 PetscCall(PetscFree(coo)); 4335 PetscFunctionReturn(PETSC_SUCCESS); 4336 } 4337 4338 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4339 { 4340 PetscBool dev_ij = PETSC_FALSE; 4341 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4342 PetscInt *i, *j; 4343 PetscContainer container_h; 4344 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4345 4346 PetscFunctionBegin; 4347 PetscCall(PetscGetMemType(coo_i, &mtype)); 4348 if (PetscMemTypeDevice(mtype)) { 4349 dev_ij = PETSC_TRUE; 4350 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4351 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4352 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4353 } else { 4354 i = coo_i; 4355 j = coo_j; 4356 } 4357 4358 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4359 if (dev_ij) PetscCall(PetscFree2(i, j)); 4360 mat->offloadmask = PETSC_OFFLOAD_CPU; 4361 // Create the GPU memory 4362 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4363 4364 // Copy the COO struct to device 4365 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4366 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4367 PetscCall(PetscMalloc1(1, &coo_d)); 4368 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4369 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4370 
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* if the user gave v[] in host memory, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
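
/*
   A minimal usage sketch (illustrative only, not part of this implementation) of the
   COO assembly path above. Applications call the type-independent
   MatSetPreallocationCOO()/MatSetValuesCOO(), which dispatch to the two routines
   above for MATSEQAIJCUSPARSE; the sizes and entries here are made up.

     Mat         A;
     PetscInt    coo_i[] = {0, 0, 1};        // row indices, repeated (i,j) entries allowed
     PetscInt    coo_j[] = {0, 1, 1};        // column indices
     PetscScalar coo_v[] = {1.0, 2.0, 3.0};  // may live in host or device memory

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds jmap/perm and mirrors them on the GPU
     PetscCall(MatSetValuesCOO(A, coo_v, ADD_VALUES));      // runs the MatAddCOOValues kernel above
*/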

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}
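
/*
   A usage sketch for the two routines above (illustrative only). The returned
   pointers are device (GPU) addresses of the 32-bit, 0-based CSR arrays and
   remain owned by the matrix; restoring merely drops the caller's aliases.

     const int *di, *dj;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj));
     // hand di/dj to a custom CUDA kernel or a cuSPARSE routine here
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/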

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
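
/*
   A read-only access sketch (illustrative only): summing the stored values with
   Thrust. It assumes a real-scalar build and that the caller obtained nnz
   elsewhere, e.g. from MatGetInfo().

     const PetscScalar *da;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da));
     PetscScalar sum = thrust::reduce(thrust::device_pointer_cast(da), thrust::device_pointer_cast(da) + nnz);
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da));
*/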

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
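
/*
   A read-write access sketch (illustrative only): scaling all stored values in
   place on the device with a Thrust placeholder expression (a small functor in
   the style of Shift below works equally well). nnz is assumed known to the
   caller, and thrust/functional.h is assumed included for the placeholders.

     PetscScalar *da;
     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &da));
     auto p = thrust::device_pointer_cast(da);
     thrust::transform(p, p + nnz, p, 2.0 * thrust::placeholders::_1);
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &da)); // bumps the object state, invalidates the diagonal
*/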

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
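
/*
   A write-only access sketch (illustrative only): overwriting every stored
   value. Unlike the read variants above, no stale data is copied to the device
   first, so this is the cheapest accessor when all values are replaced.
   nnz is assumed known to the caller.

     PetscScalar *da;
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &da));
     auto p = thrust::device_pointer_cast(da);
     thrust::fill(p, p + nnz, (PetscScalar)0.0);
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &da));
*/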

struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows:
   the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix *)Acusp->mat->mat;
    Bcsr  = (CsrMatrix *)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
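    /* coords records where each CSR entry of C came from: after the merge below,
       positions [0, Annz) hold the locations in C of A's entries and [Annz, c->nz)
       those of B's, so the MAT_REUSE_MATRIX branch can scatter fresh A/B values
       straight into C without repeating the symbolic merge */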
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 // Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
#if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
#else
      auto pred = cuda::std::identity();
#endif
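      /* wPerm holds 1 for merged entries that came from A and 0 for those from B
         (the constant iterators Aperm/Bperm above): copy_if gathers the C positions
         of A's entries into p1 and remove_copy_if the remaining ones into p2,
         together filling Ccusp->coords */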
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32- to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* results were copied device-to-host above */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
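
/*
   A usage sketch for MatSeqAIJCUSPARSEMergeMats() above (illustrative only).
   For A and B with the same number of rows it forms C = [A B] entirely on the
   device; a MAT_REUSE_MATRIX call updates only the numerical values through
   the coords permutation recorded at creation.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // symbolic + numeric merge
     // ... change values (not nonzero patterns) of A and B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // numeric-only update
*/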