/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
  // thrust::for_each(thrust::cuda::par.on()) requires C++14
  #include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Storage-format names consumed by PetscOptionsEnum(); the trailing entries are the enum type
   name and option prefix per the PetscOptionsEnum() convention, then a terminating 0 */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementation routines defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Legacy csrsv-based solve path, only compiled for CUDA < 11.4 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Implementation behind MatCUSPARSESetFormat(): records the requested GPU storage format in the
   Mat_SeqAIJCUSPARSE struct. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the same (single)
   format field on the sequential matrix; any other op is an error. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod: silently no-op for matrix types that do not implement MatCUSPARSESetFormat_C */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Implementation behind MatCUSPARSESetUseCPUSolve(): stores the use-CPU-solve flag in the
   Mat_SeqAIJCUSPARSE struct consulted by the solve routines */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod: silently no-op for matrix types that do not implement MatCUSPARSESetUseCPUSolve_C */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSetOption implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE (which needs the cached GPU
   transpose invalidated when turned off); everything else is forwarded to the SeqAIJ base class */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Parses the -mat_cusparse_* command line options (storage format, CPU solve, and — for CUDA >= 11 —
   the SpMV/SpMM/csr2csc algorithm choices). Only applies to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Merges the host (I)LU factors of A (stored in the skewed SeqAIJ layout: L rows in a->i/a->j,
   U rows addressed backwards through a->diag) into one regular CSR matrix M = L+U on the device,
   creates the cuSPARSE generic-API descriptors for the L and U triangles of M, and runs the
   numeric cusparseSpSV_analysis(). Only acts when the factors' latest data is on the CPU. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];    // number of entries of L in row i (strictly below the diagonal)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of entries of U in row i (including the diagonal)
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
        Mj[Mi[i] + llen] = i;                                   // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same device CSR arrays (csrRowPtr/csrColIdx/csrVal); the FILL_MODE/DIAG_TYPE
      // attributes tell cuSPARSE which triangle of the combined matrix each descriptor represents
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values; the structural setup above is done once, this value refresh happens on every numeric factorization
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];       // recover the diagonal entry (SeqAIJ stores its reciprocal)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Legacy path (CUDA < 11.4): extracts the lower triangular factor L (with an explicit unit
   diagonal appended to each row) from the host SeqAIJ factor storage into a separate CSR matrix
   on the device and runs the csrsv solve analysis. On repeated calls with the same structure it
   only refreshes the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign()s below can copy to the device efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row i gets the strictly-lower entries of row i of the
           factor followed by an explicit unit diagonal entry */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy path (CUDA < 11.4): extracts the upper triangular factor U from the host SeqAIJ factor
   storage (addressed backwards through a->diag; the stored diagonal is the reciprocal of the true
   pivot, see the 1./v[nz] below) into a separate CSR matrix on the device and runs the csrsv solve
   analysis. On repeated calls it only refreshes the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust assign()s below can copy to the device efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix; rows are walked from last to first because adiag[]
           addresses the U part of the factor backwards */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* stored pivot is the reciprocal of the true diagonal */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Pushes the (I)LU factors of A to the GPU (via the CUDA-version-appropriate builder above) and
   caches the row/column permutation indices on the device when the orderings are not identities */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
/* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Cholesky analogue of MatSeqAIJCUSPARSEBuildFactoredMatrix_LU: copies the host ICC/Cholesky
   factor U (unit-diagonal CSR, per the layout described at MatICCFactorSymbolic_SeqAIJ) and the
   diagonal to the device, and sets up SpSV descriptors for both U and U^T solves.
   NOTE(review): "Cheolesky" in the name looks like a typo for "Cholesky"; not renamed here since
   callers are outside this view — confirm and rename together with its call sites. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
// NOTE(review): first-time setup path of MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky. It re-packs the
// host Cholesky/ICC factor (see the layout comment referencing MatICCFactorSymbolic_SeqAIJ just above) into a
// plain CSR matrix U with an explicit unit diagonal, mirrors Ai/Mj to the device, and creates the cuSPARSE
// generic-API descriptors and SpSV analysis buffers used by MatSolve_SeqAIJCUSPARSE_Cholesky.
// (The decimal tokens embedded below are original file line numbers fused in by text extraction.)
648 Mnz = Ai[m]; // Unz (with the unit diagonal) 649 PetscCall(PetscMalloc1(Mnz, &Ma)); 650 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 651 PetscCall(PetscMalloc1(m, &D)); // the diagonal 652 for (PetscInt i = 0; i < m; i++) { 653 PetscInt ulen = Ai[i + 1] - Ai[i]; 654 Mj[Ai[i]] = i; // diagonal entry 655 PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 656 } 657 // Copy M (U) from host to device 658 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 659 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 660 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 661 PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m)); 662 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 663 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 664 665 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 666 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 667 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 668 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 669 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 670 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 671 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 672 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ?
// (continuation) with spMatDescr_U marked FILL_MODE_UPPER + DIAG_TYPE_UNIT, one descriptor serves both the
// U solve (NON_TRANSPOSE, spsvDescr_U) and the Ut solve (TRANSPOSE, spsvDescr_Ut); each gets its own work
// buffer. X/Y are dense work vectors of length m. Ma/D are cached in fs->csrVal_h / fs->diag_h so later
// numeric re-factorizations only refresh values (the "Copy the value" path at the end).
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 673 674 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 675 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 676 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 677 678 // Allocate work vectors in SpSv 679 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 680 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 681 682 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 683 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 684 685 // Query buffer sizes for SpSV and then allocate buffers 686 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 687 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 688 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 689 690 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 691 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 692 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 693 694 // Record for reuse 695 fs->csrVal_h = Ma; 696 fs->diag_h = D; 697 PetscCall(PetscFree(Mj)); 698 } 699 // Copy the value 700 Ma = fs->csrVal_h; 701 D = fs->diag_h; 702 Mnz = Ai[m]; 703 for
(PetscInt i = 0; i < m; i++) { 704 D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 705 Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 706 for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 707 } 708 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 709 PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 710 711 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 712 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 713 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 714 } 715 PetscFunctionReturn(PETSC_SUCCESS); 716 } 717 718 // Solve Ut D U x = b 719 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 720 { 721 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 722 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 723 const PetscScalar *barray; 724 PetscScalar *xarray; 725 thrust::device_ptr<const PetscScalar> bGPU; 726 thrust::device_ptr<PetscScalar> xGPU; 727 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 728 PetscInt m = A->rmap->n; 729 730 PetscFunctionBegin; 731 PetscCall(PetscLogGpuTimeBegin()); 732 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 733 PetscCall(VecCUDAGetArrayRead(b, &barray)); 734 xGPU = thrust::device_pointer_cast(xarray); 735 bGPU = thrust::device_pointer_cast(barray); 736 737 // Reorder b with the row permutation if 
// NOTE(review): middle of MatSolve_SeqAIJCUSPARSE_Cholesky, which solves Ut D U x = b. This segment
// (a) applies the row permutation rpermIndices to b via a thrust permutation-iterator copy into fs->X when
// present, otherwise aliases the dnVec descriptor straight onto barray (no copy), and (b) performs the first
// triangular solve Ut Y = X with cusparseSpSV_solve using spsvDescr_Ut from the analysis phase.
needed, and wrap the result in fs->X 738 if (fs->rpermIndices) { 739 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 740 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 741 } else { 742 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 743 } 744 745 // Solve Ut Y = X 746 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 747 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 748 749 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 750 // It is basically a vector element-wise multiplication, but cublas does not have it!
// NOTE(review): the diagonal scaling that follows is a thrust::transform element-wise multiply on the
// default PETSc stream; fs->diag was filled from Aa[Adiag[i]] (already-inverted diagonal) at build time.
751 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 752 753 // Solve U X = Y 754 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 755 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 756 } else { 757 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 758 } 759 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 760 761 // Reorder X with the column permutation if needed, and put the result back to x 762 if (fs->cpermIndices) { 763 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 764 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 765 } 766 767 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 768 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 769 PetscCall(PetscLogGpuTimeEnd()); 770 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 771 PetscFunctionReturn(PETSC_SUCCESS); 772 } 773 #else 774 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 775 { 776 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 777 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 778 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 779 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 780 PetscInt *AiUp, *AjUp; 781 PetscScalar *AAUp; 782 PetscScalar *AALo; 783 PetscInt nzUpper = 
a->nz, n = A->rmap->n, i, offset, nz, j; 784 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 785 const PetscInt *ai = b->i, *aj = b->j, *vj; 786 const MatScalar *aa = b->a, *v; 787 788 PetscFunctionBegin; 789 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 790 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 791 try { 792 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 793 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 794 if (!upTriFactor && !loTriFactor) { 795 /* Allocate Space for the upper triangular matrix */ 796 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 797 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 798 799 /* Fill the upper triangular matrix */ 800 AiUp[0] = (PetscInt)0; 801 AiUp[n] = nzUpper; 802 offset = 0; 803 for (i = 0; i < n; i++) { 804 /* set the pointers */ 805 v = aa + ai[i]; 806 vj = aj + ai[i]; 807 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 808 809 /* first, set the diagonal elements */ 810 AjUp[offset] = (PetscInt)i; 811 AAUp[offset] = (MatScalar)1.0 / v[nz]; 812 AiUp[i] = offset; 813 AALo[offset] = (MatScalar)1.0 / v[nz]; 814 815 offset += 1; 816 if (nz > 0) { 817 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 818 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 819 for (j = offset; j < offset + nz; j++) { 820 AAUp[j] = -AAUp[j]; 821 AALo[j] = AAUp[j] / v[nz]; 822 } 823 offset += nz; 824 } 825 } 826 827 /* allocate space for the triangular factor information */ 828 PetscCall(PetscNew(&upTriFactor)); 829 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 830 831 /* Create the matrix description */ 832 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 833 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 834 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 835 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL)); 836 #else 837 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 838 #endif 839 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 840 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 841 842 /* set the matrix */ 843 upTriFactor->csrMat = new CsrMatrix; 844 upTriFactor->csrMat->num_rows = A->rmap->n; 845 upTriFactor->csrMat->num_cols = A->cmap->n; 846 upTriFactor->csrMat->num_entries = a->nz; 847 848 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 849 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 850 851 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 852 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 853 854 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 855 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 856 857 /* set the operation */ 858 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 859 860 /* Create the solve analysis information */ 861 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 862 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 863 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 864 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 865 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 866 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 867 #endif 868 869 /* perform the solve analysis */ 870 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, 
upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 871 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 872 873 PetscCallCUDA(WaitForCUDA()); 874 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 875 876 /* assign the pointer */ 877 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 878 879 /* allocate space for the triangular factor information */ 880 PetscCall(PetscNew(&loTriFactor)); 881 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 882 883 /* Create the matrix description */ 884 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 885 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 886 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 887 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 888 #else 889 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 890 #endif 891 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 892 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 893 894 /* set the operation */ 895 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 896 897 /* set the matrix */ 898 loTriFactor->csrMat = new CsrMatrix; 899 loTriFactor->csrMat->num_rows = A->rmap->n; 900 loTriFactor->csrMat->num_cols = A->cmap->n; 901 loTriFactor->csrMat->num_entries = a->nz; 902 903 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 904 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 905 906 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 907 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 908 909 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 910 loTriFactor->csrMat->values->assign(AALo, AALo + 
a->nz); 911 912 /* Create the solve analysis information */ 913 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 914 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 915 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 916 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 917 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 918 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 919 #endif 920 921 /* perform the solve analysis */ 922 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 923 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 924 925 PetscCallCUDA(WaitForCUDA()); 926 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 927 928 /* assign the pointer */ 929 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 930 931 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 932 PetscCallCUDA(cudaFreeHost(AiUp)); 933 PetscCallCUDA(cudaFreeHost(AjUp)); 934 } else { 935 /* Fill the upper triangular matrix */ 936 offset = 0; 937 for (i = 0; i < n; i++) { 938 /* set the pointers */ 939 v = aa + ai[i]; 940 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 941 942 /* first, set the diagonal elements */ 943 AAUp[offset] = 1.0 / v[nz]; 944 AALo[offset] = 1.0 / v[nz]; 945 946 offset += 1; 947 if (nz > 0) { 948 PetscCall(PetscArraycpy(&AAUp[offset], v, 
nz)); 949 for (j = offset; j < offset + nz; j++) { 950 AAUp[j] = -AAUp[j]; 951 AALo[j] = AAUp[j] / v[nz]; 952 } 953 offset += nz; 954 } 955 } 956 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 957 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 958 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 959 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 960 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 961 } 962 PetscCallCUDA(cudaFreeHost(AAUp)); 963 PetscCallCUDA(cudaFreeHost(AALo)); 964 } catch (char *ex) { 965 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 966 } 967 } 968 PetscFunctionReturn(PETSC_SUCCESS); 969 } 970 #endif 971 972 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 973 { 974 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 975 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 976 IS ip = a->row; 977 PetscBool perm_identity; 978 PetscInt n = A->rmap->n; 979 980 PetscFunctionBegin; 981 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 982 983 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 984 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 985 #else 986 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 987 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 988 #endif 989 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 990 991 A->offloadmask = PETSC_OFFLOAD_BOTH; 992 993 /* lower triangular indices */ 994 PetscCall(ISIdentity(ip, &perm_identity)); 995 if (!perm_identity) { 996 IS iip; 997 const PetscInt *irip, *rip; 998 999 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 1000 PetscCall(ISGetIndices(iip, &irip)); 1001 PetscCall(ISGetIndices(ip, &rip)); 1002 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1003 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 
1004 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1005 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1006 PetscCall(ISRestoreIndices(iip, &irip)); 1007 PetscCall(ISDestroy(&iip)); 1008 PetscCall(ISRestoreIndices(ip, &rip)); 1009 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1010 } 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1015 { 1016 PetscFunctionBegin; 1017 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1018 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1019 B->offloadmask = PETSC_OFFLOAD_CPU; 1020 1021 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1022 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1023 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1024 #else 1025 /* determine which version of MatSolve needs to be used. */ 1026 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1027 IS ip = b->row; 1028 PetscBool perm_identity; 1029 1030 PetscCall(ISIdentity(ip, &perm_identity)); 1031 if (perm_identity) { 1032 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1033 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1034 } else { 1035 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1036 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1037 } 1038 #endif 1039 B->ops->matsolve = NULL; 1040 B->ops->matsolvetranspose = NULL; 1041 1042 /* get the triangular factors */ 1043 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1044 PetscFunctionReturn(PETSC_SUCCESS); 1045 } 1046 1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1049 { 1050 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1051 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1052 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1053 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1054 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1055 cusparseIndexBase_t indexBase; 1056 cusparseMatrixType_t matrixType; 1057 cusparseFillMode_t fillMode; 1058 cusparseDiagType_t diagType; 1059 1060 PetscFunctionBegin; 1061 /* allocate space for the transpose of the lower triangular factor */ 1062 PetscCall(PetscNew(&loTriFactorT)); 1063 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1064 1065 /* set the matrix descriptors of the lower triangular factor */ 1066 matrixType = cusparseGetMatType(loTriFactor->descr); 1067 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1068 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1069 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1070 1071 /* Create the matrix description */ 1072 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1073 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1074 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1075 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1076 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1077 1078 /* set the operation */ 1079 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1080 1081 /* allocate GPU space for the CSC of the lower triangular factor*/ 1082 loTriFactorT->csrMat = new CsrMatrix; 1083 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1084 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1085 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1086 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1087 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1088 
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1089 1090 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1091 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1092 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1093 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1094 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1095 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1096 #endif 1097 1098 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1099 { 1100 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
// REVIEW FIX (lower-factor half of MatSeqAIJCUSPARSEAnalyzeTransposeForSolve): the statement numbered 1112
// below originally called PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, ...) a second time — the event
// was already begun at 1098 — so the event was never ended, corrupting PETSc's -log_view event nesting.
// Changed to PetscLogEventEnd to close the event after WaitForCUDA(). Everything else is unchanged.
1101 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1102 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1103 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1104 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1105 #else 1106 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1107 #endif 1108 PetscCallCUSPARSE(stat); 1109 } 1110 1111 PetscCallCUDA(WaitForCUDA()); 1112 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1113 1114 /* Create the solve analysis information */ 1115 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1116 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1117 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1118 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1119 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1120 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1121 #endif 1122 1123 /* perform the solve analysis */ 1124 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1125
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1126 1127 PetscCallCUDA(WaitForCUDA()); 1128 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1129 1130 /* assign the pointer */ 1131 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1132 1133 /*********************************************/ 1134 /* Now the Transpose of the Upper Tri Factor */ 1135 /*********************************************/ 1136 1137 /* allocate space for the transpose of the upper triangular factor */ 1138 PetscCall(PetscNew(&upTriFactorT)); 1139 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1140 1141 /* set the matrix descriptors of the upper triangular factor */ 1142 matrixType = cusparseGetMatType(upTriFactor->descr); 1143 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1144 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1145 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1146 1147 /* Create the matrix description */ 1148 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1149 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1150 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1151 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1152 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1153 1154 /* set the operation */ 1155 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1156 1157 /* allocate GPU space for the CSC of the upper triangular factor*/ 1158 upTriFactorT->csrMat = new CsrMatrix; 1159 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1160 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1161 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1162 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1163 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1164 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1165 1166 /* compute the transpose of the upper triangular factor, i.e. 
// REVIEW FIX (upper-factor half of MatSeqAIJCUSPARSEAnalyzeTransposeForSolve): the statement numbered 1188
// below originally repeated PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, ...) — the event was already
// begun at 1174 — leaving it unbalanced in PETSc's event logging. Changed to PetscLogEventEnd so the
// Begin at 1174 is properly closed after WaitForCUDA(). Everything else is unchanged.
the CSC */ 1167 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1168 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1169 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1170 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1171 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1172 #endif 1173 1174 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1175 { 1176 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1177 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1178 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1180 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1181 #else 1182 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1183 #endif 1184 PetscCallCUSPARSE(stat); 1185 } 1186 1187 PetscCallCUDA(WaitForCUDA()); 1188 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1189 1190 /* Create the solve analysis information */ 1191 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1192
  /* (tail of the preceding routine: finish setting up the transposed upper-triangular factor for transpose solves) */
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  /* query the scratch-buffer size needed by the triangular-solve analysis, then allocate it on the device */
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO: this analysis setup is repeated almost verbatim several times in this file; it could be factored into a helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used below to recover the csr2csc entry-permutation from "values" that were
   filled with 0,1,2,... by thrust::sequence() and then routed through csr2csc. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/* Build (or refresh) the explicit transpose of A on the GPU, stored in cusparsestruct->matTranspose.
   On first call the transposed CSR (or HYB, pre CUDA-11) structure is created; on later calls only
   the values are updated. For the CSR format, the entry permutation csr2csc_i mapping entries of A
   to entries of A^T is cached so that value updates become a simple gather (no csr2csc rerun). */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is already current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT   = new CsrMatrix;
      matstructT->mat      = matrixT;
      /* the transpose swaps row/column dimensions; nnz is unchanged */
      matrixT->num_rows    = A->cmap->n;
      matrixT->num_cols    = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* Legacy (pre CUDA-11) HYB path: no direct HYB transpose exists, so round-trip HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute the entry permutation once: run csr2csc on "values" 0,1,2,... so that, for each entry
         of A^T, the resulting value is the index of the matching entry in A. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* matrixT->values currently holds the permutation as scalars; truncate it into the integer map csr2csc_i */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather: permute A's values into transposed order via the cached csr2csc_i map */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b with the (SpSV-based) LU factors; applies the row permutation to b on entry
   and the column permutation to x on exit when the factorization was done with reordering. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note
  // that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    // the result must be column-permuted afterwards, so solve into the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); // two triangular solves: ~2 flops per stored factor entry
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A^T x = b with the (SpSV-based) LU factors: U^T y = b', then L^T x' = y.
   The SpSV descriptors/analysis for the transpose solves are created lazily on
   first use and cached in fs (createdTransposeSpSVDescr / updatedTransposeSpSVAnalysis). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  // (re)run the numeric analysis whenever the factor values changed since the last transpose solve
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Solve A^T x = b using transposed copies of the triangular factors (legacy csrsv API, pre CUDA-11.4).
   The transposed factors are built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(); the
   row/column permutations from the factorization are applied on entry/exit. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); /* two triangular solves: ~2 flops per stored factor entry */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in natural ordering:
   no row/column permutations are applied. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A x = b with the triangular factors (legacy csrsv API, pre CUDA-11.4): permute b by the
   row permutation, solve L then U, then permute the result by the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Same as MatSolve_SeqAIJCUSPARSE but for a factorization done in natural ordering:
   no row/column permutations are applied. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization on the GPU: copy A's values into fact, factor in place with
   cusparseXcsrilu02(), then run the (numeric) SpSV analyses for the L and U solves.
   The symbolic setup (descriptors, buffers, index arrays) is done in
   MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* check for a structural/numerical zero pivot reported by cusparse */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve  = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1743 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1744 fact->ops->matsolve = NULL; 1745 fact->ops->matsolvetranspose = NULL; 1746 PetscCall(PetscLogGpuTimeEnd()); 1747 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1748 PetscFunctionReturn(PETSC_SUCCESS); 1749 } 1750 1751 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1752 { 1753 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1754 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1755 PetscInt m, nz; 1756 1757 PetscFunctionBegin; 1758 if (PetscDefined(USE_DEBUG)) { 1759 PetscInt i; 1760 PetscBool flg, missing; 1761 1762 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1763 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1764 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1765 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1766 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1767 } 1768 1769 /* Free the old stale stuff */ 1770 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1771 1772 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1773 but they will not be used. Allocate them just for easy debugging. 
1774 */ 1775 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1776 1777 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1778 fact->factortype = MAT_FACTOR_ILU; 1779 fact->info.factor_mallocs = 0; 1780 fact->info.fill_ratio_given = info->fill; 1781 fact->info.fill_ratio_needed = 1.0; 1782 1783 aij->row = NULL; 1784 aij->col = NULL; 1785 1786 /* ====================================================================== */ 1787 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1788 /* We'll do in-place factorization on fact */ 1789 /* ====================================================================== */ 1790 const int *Ai, *Aj; 1791 1792 m = fact->rmap->n; 1793 nz = aij->nz; 1794 1795 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1797 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1798 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1799 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801 1802 /* ====================================================================== */ 1803 /* Create descriptors for M, L, U */ 1804 /* ====================================================================== */ 1805 cusparseFillMode_t fillMode; 1806 cusparseDiagType_t diagType; 1807 1808 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1809 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1810 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1811 1812 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1813 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1814 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1815 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1816 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1817 */ 1818 fillMode = CUSPARSE_FILL_MODE_LOWER; 1819 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1820 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1821 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1822 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1823 1824 fillMode = CUSPARSE_FILL_MODE_UPPER; 1825 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1826 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1829 1830 /* ========================================================================= */ 1831 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1832 /* ========================================================================= */ 1833 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1834 if (m) 1835 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1836 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1837 1838 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1839 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1840 1841 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1842 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1843 1844 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1845 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1846 1847 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1848 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1849 1850 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1851 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1852 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1853 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1854 */ 1855 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1856 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1857 fs->spsvBuffer_L = fs->factBuffer_M; 1858 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1859 } else { 1860 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1861 fs->spsvBuffer_U = fs->factBuffer_M; 1862 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1863 } 1864 1865 /* ========================================================================== */ 1866 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1867 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1868 /* ========================================================================== */ 1869 int structural_zero; 1870 cusparseStatus_t status; 1871 1872 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1873 if (m) 1874 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1875 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1876 if (PetscDefined(USE_DEBUG)) { 1877 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A)); /* ensure Aseq->diag[] is valid so we can count entries left of the diagonal */
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        /* Exact count of nonzeros strictly left of the diagonal in row i. Do NOT approximate it as
           (nzRow - 1)/2 here; that halving heuristic is only for the ICC0 estimator, which has no
           diagonal information. Overwriting nzLeft would make the Adiag-based count dead code.
        */
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged by the numeric-factorization routine via PetscLogGpuFlops() */
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Triangular solve x = U^{-1} L^{-1} b with the ICC(0) factor L (and its transpose) computed on device */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1924 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1925 1926 /* Solve Lt*x = y */ 1927 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1928 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1929 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1930 1931 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1932 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1933 1934 PetscCall(PetscLogGpuTimeEnd()); 1935 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1936 PetscFunctionReturn(PETSC_SUCCESS); 1937 } 1938 1939 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1940 { 1941 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1942 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1943 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1944 CsrMatrix *Acsr; 1945 PetscInt m, nz; 1946 PetscBool flg; 1947 1948 PetscFunctionBegin; 1949 if (PetscDefined(USE_DEBUG)) { 1950 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1951 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1952 } 1953 1954 /* Copy A's value to fact */ 1955 m = fact->rmap->n; 1956 nz = aij->nz; 1957 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1958 Acsr = (CsrMatrix *)Acusp->mat->mat; 1959 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1960 1961 /* Factorize fact inplace */ 1962 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1963 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1964 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1965 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1966 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1967 */ 1968 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1969 if (PetscDefined(USE_DEBUG)) { 1970 int numerical_zero; 1971 cusparseStatus_t status; 1972 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1973 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1974 } 1975 1976 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1977 1978 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1979 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1980 */ 1981 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1982 1983 fact->offloadmask = PETSC_OFFLOAD_GPU; 1984 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1985 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1986 fact->ops->matsolve = NULL; 1987 fact->ops->matsolvetranspose = NULL; 1988 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1989 PetscFunctionReturn(PETSC_SUCCESS); 1990 } 
1991 1992 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1993 { 1994 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1995 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1996 PetscInt m, nz; 1997 1998 PetscFunctionBegin; 1999 if (PetscDefined(USE_DEBUG)) { 2000 PetscInt i; 2001 PetscBool flg, missing; 2002 2003 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2004 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2005 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2006 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2007 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2008 } 2009 2010 /* Free the old stale stuff */ 2011 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2012 2013 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2014 but they will not be used. Allocate them just for easy debugging. 2015 */ 2016 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2017 2018 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2019 fact->factortype = MAT_FACTOR_ICC; 2020 fact->info.factor_mallocs = 0; 2021 fact->info.fill_ratio_given = info->fill; 2022 fact->info.fill_ratio_needed = 1.0; 2023 2024 aij->row = NULL; 2025 aij->col = NULL; 2026 2027 /* ====================================================================== */ 2028 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2029 /* We'll do in-place factorization on fact */ 2030 /* ====================================================================== */ 2031 const int *Ai, *Aj; 2032 2033 m = fact->rmap->n; 2034 nz = aij->nz; 2035 2036 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2037 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2038 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2039 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2040 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2041 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2042 2043 /* ====================================================================== */ 2044 /* Create mat descriptors for M, L */ 2045 /* ====================================================================== */ 2046 cusparseFillMode_t fillMode; 2047 cusparseDiagType_t diagType; 2048 2049 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2050 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2051 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2052 2053 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2054 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2055 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2056 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2057 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
2058 */ 2059 fillMode = CUSPARSE_FILL_MODE_LOWER; 2060 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2061 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2062 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2063 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2064 2065 /* ========================================================================= */ 2066 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2067 /* ========================================================================= */ 2068 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2069 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2070 2071 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2072 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2073 2074 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2075 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2076 2077 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2078 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2079 2080 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2081 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2082 2083 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2084 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2085 */ 2086 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2087 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2088 fs->spsvBuffer_L = fs->factBuffer_M; 2089 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2090 } else { 2091 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2092 fs->spsvBuffer_Lt = fs->factBuffer_M; 2093 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2094 } 2095 2096 /* ========================================================================== */ 2097 /* Perform analysis of ic0 on M */ 2098 /* The lower triangular part of M has the same sparsity pattern as L */ 2099 /* ========================================================================== */ 2100 int structural_zero; 2101 cusparseStatus_t status; 2102 2103 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2104 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2105 if (PetscDefined(USE_DEBUG)) { 2106 /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2107 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2108 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2109 } 2110 2111 /* Estimate FLOPs of the numeric factorization */ 2112 { 2113 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2114 PetscInt *Ai, nzRow, nzLeft; 2115 PetscLogDouble flops = 0.0; 2116 2117 Ai = Aseq->i; 2118 for (PetscInt i = 0; i < m; i++) { 2119 nzRow = Ai[i + 1] - Ai[i]; 2120 if (nzRow > 1) { 2121 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2122 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2123 */ 2124 nzLeft = (nzRow - 1) / 2; 2125 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2126 } 2127 } 2128 fs->numericFactFlops = flops; 2129 } 2130 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2131 PetscFunctionReturn(PETSC_SUCCESS); 2132 } 2133 #endif 2134 2135 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2136 { 2137 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2138 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2139 2140 PetscFunctionBegin; 2141 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2142 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2143 B->offloadmask = PETSC_OFFLOAD_CPU; 2144 2145 if (!cusparsestruct->use_cpu_solve) { 2146 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2147 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2148 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2149 #else 2150 /* determine which version of MatSolve needs to be used. 
*/
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    /* Natural (identity) row/column orderings allow the faster no-permutation solve kernels */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic LU: delegate to the CPU SeqAIJ symbolic phase, then install the CUSPARSE numeric routine */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Discard any stale device-side factors before a fresh symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU: use the all-device ILU(0) fast path when possible (CUDA >= 11.4, zero fill levels,
   identity orderings, factorization bound to device); otherwise fall back to the CPU symbolic phase */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) { /* ILU(0) with natural ordering: device fast path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC: mirror of the ILU dispatch above — IC(0) with identity permutation goes to the
   all-device path, everything else to the CPU symbolic phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic Cholesky: no device fast path — always CPU symbolic, CUSPARSE numeric */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reports the solver-package name ("cusparse") for this factor object */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE =
"cusparse" - A matrix type providing triangular solvers for seq matrices 2248 on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2249 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2250 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2251 CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2252 algorithms are not recommended. This class does NOT support direct solver operations. 2253 2254 Level: beginner 2255 2256 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 2257 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2258 M*/ 2259 2260 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2261 { 2262 PetscInt n = A->rmap->n; 2263 PetscBool factOnDevice, factOnHost; 2264 char *prefix; 2265 char factPlace[32] = "device"; /* the default */ 2266 2267 PetscFunctionBegin; 2268 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2269 PetscCall(MatSetSizes(*B, n, n, n, n)); 2270 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2271 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2272 2273 prefix = (*B)->factorprefix ? 
(*B)->factorprefix : ((PetscObject)A)->prefix; 2274 PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat"); 2275 PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2276 PetscOptionsEnd(); 2277 PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2278 PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2279 PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2280 ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2281 2282 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2283 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2284 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2285 if (!A->boundtocpu) { 2286 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2287 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2288 } else { 2289 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2290 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2291 } 2292 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2293 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2294 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2295 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2296 if (!A->boundtocpu) { 2297 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2298 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2299 } else { 2300 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2301 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings for the Cholesky-type factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* Preallocation is skipped here; the symbolic factorization routines allocate the factor storage */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy the matrix values from the device back to the host CSR arrays.

   Runs only when the device copy is the up-to-date one (A->offloadmask == PETSC_OFFLOAD_GPU).
   An unfactored matrix copies from the cuSPARSE CSR values array; with CUDA >= 11.4 a
   factored matrix copies from fs->csrVal when it exists; any other factored case errors.
   On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJGetArray(): return the host values array, syncing it from the GPU first if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJRestoreArray(): the host values may have been written, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access: sync from GPU on get, leave the offload mask untouched on restore */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host copy on get since the values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Return device pointers to the CSR arrays (and report PETSC_MEMTYPE_CUDA) of an
   unfactored matrix, pushing host data to the GPU first if necessary */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  /* The device CSR stores 32-bit indices (THRUSTINTARRAY32), so raw index pointers
     can only be handed out in non-64-bit-index builds */
  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Build or refresh the cuSPARSE representation of the matrix on the GPU.

   No-op when the device copy is already current or the matrix is unallocated on the CPU side.
   When the nonzero pattern is unchanged and the format is CSR, only the values are re-uploaded;
   otherwise the whole device structure is destroyed and rebuilt from the host CSR arrays,
   honoring compressed-row storage when in use. On exit the offload mask is PETSC_OFFLOAD_BOTH
   unless the host had no values array yet (then only the pattern was uploaded). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Nonzero pattern (or storage format) changed: discard and rebuild all device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row: only rows with nonzeros are stored; ridx maps back to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values yet: size from the row offsets and do not mark device values in sync */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants, used together with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: stage a CSR copy on the device, convert it to HYB, then free the CSR copy */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        /* NOTE(review): thrust/CUDA failures are normally thrown as std::exception-derived
           types, which a char* handler would not catch -- confirm this is intentional */
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Thrust functor: tuple element 1 += tuple element 0 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: tuple element 1 = tuple element 0 */
struct
VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: tuple element 0 = tuple element 1 (reverse-direction copy) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data stored in C->product->data for cuSPARSE mat-mat operations */
struct MatMatCusparse {
  PetscBool      cisdense; /* was C originally MATSEQDENSE? If so, convert back after the GPU product */
  PetscScalar   *Bt;       /* device buffer holding B^T on paths where the backend cannot transpose B */
  Mat            X;        /* intermediate dense matrix for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data; installed as C->product->destroy */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* Numeric phase of products of a sparse CUSPARSE matrix A with a dense CUDA matrix B
   (AB, AtB, ABt, PtAP, RARt). Dispatches on the product type to choose op(A) and the
   result dimensions, then performs the sparse-times-dense multiply on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* Select op(A) and the dimensions of the (possibly intermediate) result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly stored transpose instead of the transpose operation */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* PtAP/RARt first compute an intermediate dense result in mmdata->X */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* grow (never shrink) the workspace buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    /* explicitly transpose B into mmdata->Bt with cublas geam */
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* PtAP/RARt: combine the intermediate X with B via a dense-dense multiply to form C */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary GPU conversions performed above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse(A)-times-dense(B) products: sets the sizes and (CUDA dense) type
   of C and allocates the MatMatCusparse product data, including intermediate storage for
   PtAP/RARt and the B^T buffer needed by the pre-CUDA-11 backend */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product =
C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* Result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of sparse-times-sparse products C = op(A) op(B) computed with cuSPARSE SpGEMM.
   The symbolic phase may have already computed the values (mmdata->reusesym); in that case
   only the assembly bookkeeping at the end is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* C has no nonzeros: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format ==
MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  ptype = product->type;
  /* A symmetric operand lets the transpose product be computed as a plain AB product */
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* Transposes are realized via the explicitly stored transpose structures */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: recompute values reusing the SpGEMM structure built in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse-times-sparse products via cuSPARSE SpGEMM */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
/* NOTE(review): symbolic phase continues — both operands must be MATSEQAIJCUSPARSE in CSR storage; the product data (mmdata) is created here and owned by C->product. */
PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3043 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3044 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3045 a = (Mat_SeqAIJ *)A->data; 3046 b = (Mat_SeqAIJ *)B->data; 3047 /* product data */ 3048 PetscCall(PetscNew(&mmdata)); 3049 C->product->data = mmdata; 3050 C->product->destroy = MatDestroy_MatMatCusparse; 3051 3052 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3053 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3054 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3055 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3056 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3057 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3058 3059 ptype = product->type; 3060 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3061 ptype = MATPRODUCT_AB; 3062 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3063 } 3064 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3065 ptype = MATPRODUCT_AB; 3066 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3067 } 3068 biscompressed = PETSC_FALSE; 3069 ciscompressed = PETSC_FALSE; 3070 switch (ptype) { 3071 case MATPRODUCT_AB: 3072 m = A->rmap->n; 3073 n = B->cmap->n; 3074 k = A->cmap->n; 3075 Amat = Acusp->mat; 3076 Bmat = Bcusp->mat; 3077 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3078 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3079 break; 3080 case MATPRODUCT_AtB: 3081 m = A->cmap->n; 3082 n = B->cmap->n; 3083 k = A->rmap->n; 3084 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3085 Amat = Acusp->matTranspose; 3086 Bmat = Bcusp->mat; 3087 if
/* NOTE(review): AtB uses A's explicit transpose (cuSPARSE SpGEMM does not support op != NON_TRANSPOSE here, see opA/opB above), so only B's compressed-row layout matters in this case. */
(b->compressedrow.use) biscompressed = PETSC_TRUE; 3088 break; 3089 case MATPRODUCT_ABt: 3090 m = A->rmap->n; 3091 n = B->rmap->n; 3092 k = A->cmap->n; 3093 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3094 Amat = Acusp->mat; 3095 Bmat = Bcusp->matTranspose; 3096 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3097 break; 3098 default: 3099 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3100 } 3101 3102 /* create cusparse matrix */ 3103 PetscCall(MatSetSizes(C, m, n, m, n)); 3104 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3105 c = (Mat_SeqAIJ *)C->data; 3106 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3107 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3108 Ccsr = new CsrMatrix; 3109 3110 c->compressedrow.use = ciscompressed; 3111 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3112 c->compressedrow.nrows = a->compressedrow.nrows; 3113 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3114 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3115 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3116 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3117 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3118 } else { 3119 c->compressedrow.nrows = 0; 3120 c->compressedrow.i = NULL; 3121 c->compressedrow.rindex = NULL; 3122 Ccusp->workVector = NULL; 3123 Cmat->cprowIndices = NULL; 3124 } 3125 Ccusp->nrows = ciscompressed ?
/* NOTE(review): C inherits A's compressed-row pattern (rindex copied from a->compressedrow above), so its on-GPU row count is the number of stored (nonzero) rows, not m. */
c->compressedrow.nrows : m; 3126 Ccusp->mat = Cmat; 3127 Ccusp->mat->mat = Ccsr; 3128 Ccsr->num_rows = Ccusp->nrows; 3129 Ccsr->num_cols = n; 3130 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3131 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3132 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3133 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3134 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3135 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3136 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3137 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3138 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3139 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3140 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns!
*/ 3141 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3142 c->nz = 0; 3143 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3144 Ccsr->values = new THRUSTARRAY(c->nz); 3145 goto finalizesym; 3146 } 3147 3148 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3149 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3150 Acsr = (CsrMatrix *)Amat->mat; 3151 if (!biscompressed) { 3152 Bcsr = (CsrMatrix *)Bmat->mat; 3153 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3154 BmatSpDescr = Bmat->matDescr; 3155 #endif 3156 } else { /* we need to use row offsets for the full matrix */ 3157 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3158 Bcsr = new CsrMatrix; 3159 Bcsr->num_rows = B->rmap->n; 3160 Bcsr->num_cols = cBcsr->num_cols; 3161 Bcsr->num_entries = cBcsr->num_entries; 3162 Bcsr->column_indices = cBcsr->column_indices; 3163 Bcsr->values = cBcsr->values; 3164 if (!Bcusp->rowoffsets_gpu) { 3165 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3166 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3167 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3168 } 3169 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3170 mmdata->Bcsr = Bcsr; 3171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3172 if (Bcsr->num_rows && Bcsr->num_cols) { 3173 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3174 PetscCallCUSPARSE(stat); 3175 } 3176 BmatSpDescr = mmdata->matSpBDescr; 3177 #endif 3178 } 3179 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3180 PetscCheck(Bcsr,
/* NOTE(review): the flop counts below are precomputed on the host from the CSR row pointers (a->i, b->i) and cached in mmdata->flops for GPU flop logging in the numeric phase. */
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3181 /* precompute flops count */ 3182 if (ptype == MATPRODUCT_AB) { 3183 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3184 const PetscInt st = a->i[i]; 3185 const PetscInt en = a->i[i + 1]; 3186 for (j = st; j < en; j++) { 3187 const PetscInt brow = a->j[j]; 3188 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3189 } 3190 } 3191 } else if (ptype == MATPRODUCT_AtB) { 3192 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3193 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3194 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3195 flops += (2. * anzi) * bnzi; 3196 } 3197 } else { /* TODO */ 3198 flops = 0.; 3199 } 3200 3201 mmdata->flops = flops; 3202 PetscCall(PetscLogGpuTimeBegin()); 3203 3204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3205 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3206 // cuda-12.2 requires non-null csrRowOffsets 3207 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3208 PetscCallCUSPARSE(stat); 3209 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3210 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3211 { 3212 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3213 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3214 */ 3215 void *dBuffer1 = NULL; 3216 void *dBuffer2 = NULL; 3217 void *dBuffer3 = NULL; 3218 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3219 size_t bufferSize1 = 0; 3220 size_t bufferSize2 = 0; 3221 size_t bufferSize3 = 0; 3222 size_t bufferSize4 = 0; 3223 size_t bufferSize5 = 0; 3224 3225 /* ask bufferSize1 bytes for external memory */ 3226 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3227 PetscCallCUSPARSE(stat); 3228 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3229 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3230 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3231 PetscCallCUSPARSE(stat); 3232 3233 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3234 PetscCallCUSPARSE(stat); 3235 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3236 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3237 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3238 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3239 PetscCallCUSPARSE(stat); 3240 PetscCallCUDA(cudaFree(dBuffer1)); 3241 PetscCallCUDA(cudaFree(dBuffer2)); 3242 3243 /* get matrix C non-zero entries C_nnz1 */ 3244
/* NOTE(review): query nnz(C) discovered by cusparseSpGEMMreuse_nnz, allocate C's index/value arrays to that size, then hand the pointers back to the descriptor before the copy/compute steps. */
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3245 c->nz = (PetscInt)C_nnz1; 3246 /* allocate matrix C */ 3247 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3248 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3249 Ccsr->values = new THRUSTARRAY(c->nz); 3250 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3251 /* update matC with the new pointers */ 3252 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3253 PetscCallCUSPARSE(stat); 3254 3255 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3256 PetscCallCUSPARSE(stat); 3257 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3258 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3259 PetscCallCUSPARSE(stat); 3260 PetscCallCUDA(cudaFree(dBuffer3)); 3261 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3262 PetscCallCUSPARSE(stat); 3263 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3264 } 3265 #else 3266 size_t bufSize2; 3267 /* ask bufferSize bytes for external memory */ 3268 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
/* NOTE(review): CUDA 11.0-11.3 path — two-phase cusparseSpGEMM (workEstimation -> compute -> copy); per the comment below, both mmBuffer2 and mmBuffer must be kept alive in mmdata for later compute calls. */
mmdata->spgemmDesc, &bufSize2, NULL); 3269 PetscCallCUSPARSE(stat); 3270 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3271 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3272 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3273 PetscCallCUSPARSE(stat); 3274 /* ask bufferSize again bytes for external memory */ 3275 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3276 PetscCallCUSPARSE(stat); 3277 /* The CUSPARSE documentation is not clear, nor the API 3278 We need both buffers to perform the operations properly! 3279 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3280 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3281 is stored in the descriptor! What a messy API...
*/ 3282 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3283 /* compute the intermediate product of A * B */ 3284 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3285 PetscCallCUSPARSE(stat); 3286 /* get matrix C non-zero entries C_nnz1 */ 3287 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3288 c->nz = (PetscInt)C_nnz1; 3289 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3290 mmdata->mmBufferSize / 1024)); 3291 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3292 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3293 Ccsr->values = new THRUSTARRAY(c->nz); 3294 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3295 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3296 PetscCallCUSPARSE(stat); 3297 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3298 PetscCallCUSPARSE(stat); 3299 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3300 #else 3301 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3302 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3303
/* NOTE(review): pre-CUDA-11 path — cusparseXcsrgemmNnz returns nnz(C) in host pointer mode, then the legacy csr_spgemm computes values (no symbolic-only entry point existed; see comment below). */
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3304 PetscCallCUSPARSE(stat); 3305 c->nz = cnz; 3306 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3307 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3308 Ccsr->values = new THRUSTARRAY(c->nz); 3309 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3310 3311 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3312 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3313 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3314 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3315 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3316 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3317 PetscCallCUSPARSE(stat); 3318 #endif 3319 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3320 PetscCall(PetscLogGpuTimeEnd()); 3321 finalizesym: 3322 c->free_a = PETSC_TRUE; 3323 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3324 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3325 c->free_ij = PETSC_TRUE; 3326 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3327 PetscInt *d_i = c->i; 3328 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3329 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3330 ii =
/* NOTE(review): finalizesym — mirror the GPU CSR structure to the host Mat_SeqAIJ (c->i/c->j); 64-bit PetscInt builds widen the 32-bit cuSPARSE indices on the device before the copy. */
*Ccsr->row_offsets; 3331 jj = *Ccsr->column_indices; 3332 if (ciscompressed) d_i = c->compressedrow.i; 3333 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3334 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3335 } else { 3336 PetscInt *d_i = c->i; 3337 if (ciscompressed) d_i = c->compressedrow.i; 3338 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3339 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3340 } 3341 if (ciscompressed) { /* need to expand host row offsets */ 3342 PetscInt r = 0; 3343 c->i[0] = 0; 3344 for (k = 0; k < c->compressedrow.nrows; k++) { 3345 const PetscInt next = c->compressedrow.rindex[k]; 3346 const PetscInt old = c->compressedrow.i[k]; 3347 for (; r < next; r++) c->i[r + 1] = old; 3348 } 3349 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3350 } 3351 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3352 PetscCall(PetscMalloc1(m, &c->ilen)); 3353 PetscCall(PetscMalloc1(m, &c->imax)); 3354 c->maxnz = c->nz; 3355 c->nonzerorowcnt = 0; 3356 c->rmax = 0; 3357 for (k = 0; k < m; k++) { 3358 const PetscInt nn = c->i[k + 1] - c->i[k]; 3359 c->ilen[k] = c->imax[k] = nn; 3360 c->nonzerorowcnt += (PetscInt) !!nn; 3361 c->rmax = PetscMax(c->rmax, nn); 3362 } 3363 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3364 PetscCall(PetscMalloc1(c->nz, &c->a)); 3365 Ccsr->num_entries = c->nz; 3366 3367 C->nonzerostate++; 3368 PetscCall(PetscLayoutSetUp(C->rmap)); 3369 PetscCall(PetscLayoutSetUp(C->cmap)); 3370 Ccusp->nonzerostate = C->nonzerostate; 3371 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3372 C->preallocated = PETSC_TRUE; 3373 C->assembled =
/* NOTE(review): symbolic epilogue — C is left preallocated but not assembled; the numeric phase is deferred to ops->productnumeric set below. When both operands are already offloaded on the GPU, reusesym flags C's values as computed. */
PETSC_FALSE; 3374 C->was_assembled = PETSC_FALSE; 3375 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3376 mmdata->reusesym = PETSC_TRUE; 3377 C->offloadmask = PETSC_OFFLOAD_GPU; 3378 } 3379 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3380 PetscFunctionReturn(PETSC_SUCCESS); 3381 } 3382 3383 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3384 3385 /* handles sparse or dense B */ 3386 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3387 { 3388 Mat_Product *product = mat->product; 3389 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3390 3391 PetscFunctionBegin; 3392 MatCheckProduct(mat, 1); 3393 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3394 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3395 if (product->type == MATPRODUCT_ABC) { 3396 Ciscusp = PETSC_FALSE; 3397 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3398 } 3399 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3400 PetscBool usecpu = PETSC_FALSE; 3401 switch (product->type) { 3402 case MATPRODUCT_AB: 3403 if (product->api_user) { 3404 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3405 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3406 PetscOptionsEnd(); 3407 } else { 3408 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3409 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
/* NOTE(review): each product type offers a CPU-fallback option; api_user selects the legacy MatMatMult-style option names, otherwise the generic -mat_product_algorithm_backend_cpu is used. */
3410 PetscOptionsEnd(); 3411 } 3412 break; 3413 case MATPRODUCT_AtB: 3414 if (product->api_user) { 3415 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3416 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3417 PetscOptionsEnd(); 3418 } else { 3419 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3420 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3421 PetscOptionsEnd(); 3422 } 3423 break; 3424 case MATPRODUCT_PtAP: 3425 if (product->api_user) { 3426 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3427 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3428 PetscOptionsEnd(); 3429 } else { 3430 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3431 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3432 PetscOptionsEnd(); 3433 } 3434 break; 3435 case MATPRODUCT_RARt: 3436 if (product->api_user) { 3437 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3438 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3439 PetscOptionsEnd(); 3440 } else { 3441 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3442 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3443 PetscOptionsEnd(); 3444 } 3445 break; 3446 case MATPRODUCT_ABC: 3447 if (product->api_user) { 3448 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix,
"MatMatMatMult", "Mat"); 3449 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3450 PetscOptionsEnd(); 3451 } else { 3452 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3453 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3454 PetscOptionsEnd(); 3455 } 3456 break; 3457 default: 3458 break; 3459 } 3460 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3461 } 3462 /* dispatch */ 3463 if (isdense) { 3464 switch (product->type) { 3465 case MATPRODUCT_AB: 3466 case MATPRODUCT_AtB: 3467 case MATPRODUCT_ABt: 3468 case MATPRODUCT_PtAP: 3469 case MATPRODUCT_RARt: 3470 if (product->A->boundtocpu) { 3471 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3472 } else { 3473 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3474 } 3475 break; 3476 case MATPRODUCT_ABC: 3477 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3478 break; 3479 default: 3480 break; 3481 } 3482 } else if (Biscusp && Ciscusp) { 3483 switch (product->type) { 3484 case MATPRODUCT_AB: 3485 case MATPRODUCT_AtB: 3486 case MATPRODUCT_ABt: 3487 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3488 break; 3489 case MATPRODUCT_PtAP: 3490 case MATPRODUCT_RARt: 3491 case MATPRODUCT_ABC: 3492 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3493 break; 3494 default: 3495 break; 3496 } 3497 } else { /* fallback for AIJ */ 3498 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3499 } 3500 PetscFunctionReturn(PETSC_SUCCESS); 3501 } 3502 3503 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3504 { 3505 PetscFunctionBegin; 3506 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3507 PetscFunctionReturn(PETSC_SUCCESS); 3508 } 3509 3510 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx,
/* NOTE(review): thin MatMult* wrappers — all dispatch to MatMultAddKernel_SeqAIJCUSPARSE with (trans, herm) flags: plain mult (FALSE,FALSE), Hermitian transpose (TRUE,TRUE), transpose (TRUE,FALSE). ScatterAdd is a bounds-guarded 1-D CUDA kernel doing y[idx[i]] += x[i], used to accumulate the compressed-row work vector into the full-length result. */
Vec yy, Vec zz) 3511 { 3512 PetscFunctionBegin; 3513 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3514 PetscFunctionReturn(PETSC_SUCCESS); 3515 } 3516 3517 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3518 { 3519 PetscFunctionBegin; 3520 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3521 PetscFunctionReturn(PETSC_SUCCESS); 3522 } 3523 3524 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3525 { 3526 PetscFunctionBegin; 3527 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3528 PetscFunctionReturn(PETSC_SUCCESS); 3529 } 3530 3531 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3532 { 3533 PetscFunctionBegin; 3534 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3535 PetscFunctionReturn(PETSC_SUCCESS); 3536 } 3537 3538 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3539 { 3540 int i = blockIdx.x * blockDim.x + threadIdx.x; 3541 if (i < n) y[idx[i]] += x[i]; 3542 } 3543 3544 /* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3545 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3546 { 3547 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3548 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3549 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3550 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3551 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3552 PetscBool compressed; 3553 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3554 PetscInt nx, ny; 3555 #endif 3556 3557 PetscFunctionBegin; 3558 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3559 if (!a->nz) { 3560 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3561 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3562 PetscFunctionReturn(PETSC_SUCCESS); 3563 } 3564 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3565 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3566 if (!trans) { 3567 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3568 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3569 } else { 3570 if (herm || !A->form_explicit_transpose) { 3571 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3572 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3573 } else { 3574 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3575 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3576 } 3577 } 3578 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3579 compressed = matstruct->cprowIndices ?
/* NOTE(review): when compressed rows are in use, SpMV results (or the transpose-path input x) go through cusparsestruct->workVector, which is sized to the number of stored rows. */
PETSC_TRUE : PETSC_FALSE; 3580 3581 try { 3582 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3583 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3584 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3585 3586 PetscCall(PetscLogGpuTimeBegin()); 3587 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3588 /* z = A x + beta y. 3589 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3590 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3591 */ 3592 xptr = xarray; 3593 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3594 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3595 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3596 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3597 allocated to accommodate different uses. So we get the length info directly from mat. 3598 */ 3599 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3600 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3601 nx = mat->num_cols; // since y = Ax 3602 ny = mat->num_rows; 3603 } 3604 #endif 3605 } else { 3606 /* z = A^T x + beta y 3607 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3608 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3609 */ 3610 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3611 dptr = zarray; 3612 beta = yy ?
/* NOTE(review): transpose path — A^T x has full length, so beta is 1 whenever an additive y is supplied. */
matstruct->beta_one : matstruct->beta_zero; 3613 if (compressed) { /* Scatter x to work vector */ 3614 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3615 3616 thrust::for_each( 3617 #if PetscDefined(HAVE_THRUST_ASYNC) 3618 thrust::cuda::par.on(PetscDefaultCudaStream), 3619 #endif 3620 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3621 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3622 } 3623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3624 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3625 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3626 nx = mat->num_rows; // since y = A^T x 3627 ny = mat->num_cols; 3628 } 3629 #endif 3630 } 3631 3632 /* csr_spmv does y = alpha op(A) x + beta y */ 3633 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3634 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3635 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3636 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
/* NOTE(review): build or reuse the per-operation cached cusparseSpMV state (sparse/dense descriptors + work buffer); the descriptors are created once on first use and only the dense-vector value pointers are refreshed afterwards. */
3637 #else 3638 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3639 #endif 3640 3641 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3642 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3643 if (!matDescr) { 3644 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3645 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3646 } 3647 #endif 3648 3649 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3650 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3651 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3652 PetscCallCUSPARSE( 3653 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3654 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3655 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3656 PetscCallCUSPARSE( 3657 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3658 #endif 3659 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3660 } else { 3661 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3662 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3663
PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3664 } 3665 3666 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3667 #else 3668 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3669 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3670 #endif 3671 } else { 3672 if (cusparsestruct->nrows) { 3673 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3674 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3675 #else 3676 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3677 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3678 #endif 3679 } 3680 } 3681 PetscCall(PetscLogGpuTimeEnd()); 3682 3683 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3684 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3685 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3686 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3687 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3688 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3689 } 3690 } else if (compressed) { /* MatMult: zz = A*xx.
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3691 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3692 } 3693 3694 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3695 if (compressed) { 3696 PetscCall(PetscLogGpuTimeBegin()); 3697 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3698 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3699 prevent that. So I just add a ScatterAdd kernel. 3700 */ 3701 #if 0 3702 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3703 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3704 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3705 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3706 VecCUDAPlusEquals()); 3707 #else 3708 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3709 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3710 #endif 3711 PetscCall(PetscLogGpuTimeEnd()); 3712 } 3713 } else { 3714 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3715 } 3716 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3717 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3718 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3719 } catch (char *ex) { 3720 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3721 } 3722 if (yy) { 3723 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3724 } else { 3725 PetscCall(PetscLogGpuFlops(2.0 * a->nz -
a->nonzerorowcnt)); /* MatMult on a compressed matrix skips one flop per nonempty row */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x + y, dispatched through the shared add-kernel (transpose = PETSC_TRUE, hermitian = PETSC_FALSE) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly is done on the host AIJ data; the GPU copy is refreshed lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* GPU-side storage differs for plain vs factored matrices; release the matching variant */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* clear all composed methods registered for this type before falling back to the base destroy */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));

PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate on the host, then convert the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = Y + a*X, performed on the GPU when both matrices are unbound and stored as CSR */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mismatched implementations (e.g. one matrix bound to CPU): fall back to the host path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool
eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN; /* identical structure detected on device: upgrade to the cheap axpy path */
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is a subset of Y's: use cusparse csrgeam, Y = a*X + 1*Y, writing back into Y's structure */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b live on the host here, so switch the handle's pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a,
cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore PETSc's default pointer mode for subsequent cusparse calls on this handle */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays align entry-for-entry, so a dense cublas axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general case handled on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y, scaling the nonzero value array in place on the GPU with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero the values on both the device (CSR and transpose copies, if present) and the host */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values were zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Swap the operation table between the host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* binding to CPU: pull the values back first so the host copy is current */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                  = MatScale_SeqAIJ;
    A->ops->axpy                   = MatAXPY_SeqAIJ;
    A->ops->zeroentries            = MatZeroEntries_SeqAIJ;
    A->ops->mult                   = MatMult_SeqAIJ;
    A->ops->multadd                = MatMultAdd_SeqAIJ;
    A->ops->multtranspose          = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd       = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose = NULL;
A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* remove GPU-specific composed methods while bound to the CPU */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* binding to GPU: install the CUSPARSE implementations and array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,
"MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations are only meaningful for the host implementation */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a SeqAIJ matrix (in place or into a new matrix) to the SeqAIJCUSPARSE type,
   creating the cusparse handle/options on first conversion */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor storage instead of the plain mult struct */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* installs the GPU operation table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: create a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                     Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Register the cusparse solver for LU/Cholesky/ILU/ICC factorizations of seqaijcusparse matrices */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free all GPU-side storage of an unfactored matrix: mult structs, work vectors, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if
(cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Delete the three thrust vectors of a CsrMatrix and the struct itself; sets *mat to NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free a legacy (pre CUDA-11.4) triangular-factor struct: descriptors, solve info, CSR data, buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Free a mult struct: the stored matrix (CSR or HYB), descriptors, scalar constants, and SpMV state */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cuSpMV entry per cusparseOperation_t (non-transpose/transpose/hermitian) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all factorization state held in the trifactors struct (but not the struct or its handle) */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* CUDA >= 11.4 path: free the flat CSR factor storage and the SpSV/ilu0/ic0 descriptors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reset all factor state, then destroy the handle and the trifactors struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, col) comparator for thrust sorts of index tuples */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Mark the cached transpose as stale; optionally destroy it (and the csr2csc map) outright */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Container destructor for the device-side COO struct: free device arrays, then the struct */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Preallocate from COO triplets (coo_i/coo_j may live on host or device) and mirror the COO struct on the GPU */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    // indices live on the device: the host preallocation routine needs host copies
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d)); // the matrix now holds the only reference
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Accumulate COO input values kv[] into the CSR value array a[].
   Grid-stride loop: one iteration per CSR nonzero i; jmap[i]..jmap[i+1] delimits the
   slots in perm[] that map repeated/unsorted COO inputs onto nonzero i.
   INSERT_VALUES overwrites a[i]; otherwise (ADD_VALUES) the sum is accumulated.
   Launch config is arbitrary — correctness does not depend on grid size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Set/add matrix values from a COO value array v[] (host or device memory),
   using the device COO struct attached by MatSetPreallocationCOO_SeqAIJCUSPARSE */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz; // number of CSR nonzeros in A
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;         // device pointer to the input values (v itself, or a staged copy)
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // make sure the device CSR exists

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  // Write access discards old values; read-write access keeps them for ADD_VALUES
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // NOTE(review): the (int) cast truncates for extremely large Annz; the grid-stride
    // kernel tolerates any positive grid size, but overflow here would yield a negative
    // grid dim and a launch error — confirm expected nnz range
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); // catch launch-configuration errors without clearing state
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // free the staged device copy
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); // caller asked for nothing
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      // Lazily build (and cache) the uncompressed row offsets on the device from the host CSR
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Only invalidates the caller's pointers; the device arrays stay cached on the matrix
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed; // unused; kept for symmetry with MatSeqAIJCUSPARSEGetIJ()
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
.
a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // may copy host values to the device
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  // read-only access: offloadmask and cached transpose are deliberately left untouched
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; // no state change needed: access was read-only
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // read-write access must start from up-to-date device values
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  // Caller may modify values: the device copy becomes authoritative and any cached
  // transpose values are stale (pattern kept, hence destroy = PETSC_FALSE)
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); // values may have changed; cached diagonal info is stale
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); // values changed: bump state so dependents recompute
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // note: no MatSeqAIJCUSPARSECopyToGPU() here — old values will be fully overwritten
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; // device copy becomes the authoritative one
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); // values were (re)written; cached diagonal info is stale
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Comparator ordering (row, col, val, origin-flag) 4-tuples by (row, col) only;
   used as the merge comparator when interleaving the COO forms of two matrices */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Unary functor adding a fixed offset to an int; used to shift B's column indices
   (by A->cmap->n) and B's transpose row offsets (by a->nz) during the merge */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows; [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    // ---- First call: build C's structure (pattern) and values from scratch ----
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n; // C is m x (nA + nB)
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; // C is stored with full (uncompressed) rows
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    // Standard cuSPARSE descriptor plus device-resident scalar constants for SpMV calls
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    // coords records, for each C entry, where it came from (A part first, then B part),
    // so a MAT_REUSE_MATRIX call can scatter new values without re-merging the pattern
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      // Convert both CSRs to COO row indices, shift B's columns by nA, then do a single
      // (row, col)-ordered merge producing C's COO/CSR arrays and the provenance permutation
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      // origin flags: 1 for entries from A, 0 for entries from B
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      // fall back to shifting B's columns in place (undone after the merge below)
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz); // coords[0..Annz) <- A's positions, coords[Annz..) <- B's
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); // restore B's column indices
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      // split merged positions by origin flag: A-sourced indices to p1, B-sourced to p2
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        // C' = [A'; B'] : stack the transposes row-wise, offsetting B''s row offsets by a->nz
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); // overwrite A''s trailing offset with B''s first (shifted) one
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    // ---- Mirror the device CSR pattern into C's host (Mat_SeqAIJ) structures ----
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    // Per-row lengths / max row length from the host row offsets just copied down
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a)); // host value array is allocated but not filled (values live on the GPU)
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    // ---- Reuse call: pattern is unchanged, scatter fresh values via the saved coords ----
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      // Consistency checks: the cached pattern must still match A and B
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries); // coords[0..Annz) target A's values, the rest target B's
      PetscCall(PetscLogGpuTimeBegin());
      // Scatter A's values into C at the positions recorded in coords[0..Annz)
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      // Scatter B's values into C at the positions recorded in coords[Annz..)
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        // Transpose values are stored A-block-then-B-block, so a straight copy refreshes them
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; // values exist only on the device at this point
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather n entries of A's device value array into v[]: v[k] = Aa[idx[k]] when idx is
   given, else a straight copy of the first n values; v may be host or device memory */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem; // true iff v resides in device memory
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    // Upload the index list, then gather on the device via a permutation iterator
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL; // device staging buffer when v is host memory
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  // NOTE(review): when !dmem the data just moved device->host, so PetscLogGpuToCpu()
  // looks like the intended logging call here rather than PetscLogCpuToGpu() — confirm
  if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()