1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #include <thrust/async/for_each.h> 19 #endif 20 #include <thrust/iterator/constant_iterator.h> 21 #include <thrust/remove.h> 22 #include <thrust/sort.h> 23 #include <thrust/unique.h> 24 25 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
30 31 typedef enum { 32 CUSPARSE_MV_ALG_DEFAULT = 0, 33 CUSPARSE_COOMV_ALG = 1, 34 CUSPARSE_CSRMV_ALG1 = 2, 35 CUSPARSE_CSRMV_ALG2 = 3 36 } cusparseSpMVAlg_t; 37 38 typedef enum { 39 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44 CUSPARSE_SPMM_ALG_DEFAULT = 0, 45 CUSPARSE_SPMM_COO_ALG1 = 1, 46 CUSPARSE_SPMM_COO_ALG2 = 2, 47 CUSPARSE_SPMM_COO_ALG3 = 3, 48 CUSPARSE_SPMM_COO_ALG4 = 5, 49 CUSPARSE_SPMM_CSR_ALG1 = 4, 50 CUSPARSE_SPMM_CSR_ALG2 = 6, 51 } cusparseSpMMAlg_t; 52 53 typedef enum { 54 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 55 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 56 } cusparseCsr2CscAlg_t; 57 */ 58 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61 #endif 62 63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 66 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 67 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 70 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 73 #endif 74 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 75 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 76 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 77 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 78 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 79 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 80 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 81 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 82 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 83 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 84 85 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 86 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 87 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU storage
   format on the matrix's Mat_SeqAIJCUSPARSE struct. Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   are accepted; both currently set the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records whether MatSolve()
   with this matrix's factors should run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).
.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSetOption() override: handles MAT_FORM_EXPLICIT_TRANSPOSE here (the GPU transpose copy must be
   invalidated when the option is turned off); every other option is delegated to the CPU AIJ code */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    /* all other options are handled by the host (SeqAIJ) implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Parse -mat_cusparse_* command line options (storage format, CPU solve, cuSPARSE algorithm choices) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg)
PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same consistency check as for SpMV above, keyed on the position of CSR_ALG1 in MatCUSPARSESpMMAlgorithms[] */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg ||
CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Pack the host (I)LU factors (stored in the skewed SeqAIJ factor layout) into one regular CSR
   matrix M = L + U on the device, and set up cusparseSpSV descriptors/buffers to solve with L and U */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];       // number of entries of L in row i, excluding its unit diagonal
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of entries of U in row i, including its diagonal
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same row pointers, column indices and values of M; only fill mode/diag type differ
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Pre-CUDA-11.4 path: build (or, on reuse, just refresh the values of) the unit-diagonal lower
   triangular factor L on the GPU and run the csrsv solve analysis for it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Pre-CUDA-11.4 path: build (or refresh the values of) the upper triangular factor U on the GPU;
   note the diagonal entries are stored inverted (1/d) and run the csrsv solve analysis for it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Push the host (I)LU factors to the GPU (via the version-appropriate builder) and cache the
   row/column permutation indices of the factorization in device (thrust) arrays */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* NOTE(review): "Cheolesky" is a misspelling of "Cholesky"; the function is static but renaming
   would also require updating its callers elsewhere in this file, so the name is left as-is here */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; only the structure (row pointers / column indices) is
      // uploaded here, values are uploaded below on every numeric refactorization
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0;   // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries of U are stored negated on host; flip the sign here
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
/* Applies the Cholesky factor solves on the GPU:
     1) (optional) permute b with rpermIndices into the scratch vector X
     2) solve Ut Y = X (transpose SpSV on the single stored factor U)
     3) scale Y by the (already inverted) diagonal D
     4) solve U X = Y
     5) (optional) permute the result with cpermIndices back into x */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Legacy (CUDA < 11.4) path: build the ICC factors as two csrsv triangular matrices.
   Only the upper factor structure is stored; the "lower" solve reuses it with a transpose op. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper =
a->nz, n = A->rmap->n, i, offset, nz, j;
  // NOTE(review): A->data is also viewed through the Mat_SeqSBAIJ layout here to read the
  // factor's i/j/a arrays — presumably the leading fields of the two structs are layout
  // compatible for the factored matrix; verify against the struct definitions if touched.
  Mat_SeqSBAIJ    *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt  *ai = b->i, *aj = b->j, *vj;
  const MatScalar *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First call: build structure and values for both factors */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored as reciprocals in the factor) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* host factor stores negated off-diagonals; AALo additionally divides by the diagonal */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the "lower" factor reuses the upper-triangular storage and is solved with a transpose op,
           hence FILL_MODE_UPPER here is intentional */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Subsequent numeric refactorization: structure unchanged, refresh values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Dispatches the ICC GPU setup for the active CUDA version and uploads the
   (inverse) row permutation pair used by the Cholesky solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* only the strict upper triangle is stored once; count both triangles plus the diagonal */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation and its inverse only when non-identity */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric Cholesky factorization: factor on the CPU, then install the GPU solve callbacks
   and push the factors to the device via MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(). */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; // A is symmetric, so solve == solvetranspose
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Build explicit CSC (i.e. transposed CSR) copies of both triangular factors so that
   transpose solves can use non-transpose csrsv operations, and run their solve analyses. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this must be PetscLogEventEnd() to close the Begin above; a second Begin
     leaves the MAT_CUSPARSEGenerateTranspose event unbalanced in the PETSc log */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this must be PetscLogEventEnd() to close the Begin above; a second Begin
     leaves the MAT_CUSPARSEGenerateTranspose event unbalanced in the PETSc log */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1193 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1194 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1195 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1196 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1197 #endif 1198 1199 /* perform the solve analysis */ 1200 /* christ, would it have killed you to put this stuff in a function????????? */ 1201 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1202 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1203 1204 PetscCallCUDA(WaitForCUDA()); 1205 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1206 1207 /* assign the pointer */ 1208 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 #endif 1212 1213 struct PetscScalarToPetscInt { 1214 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1215 }; 1216 1217 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1218 { 1219 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1220 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1221 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1222 cusparseStatus_t stat; 1223 cusparseIndexBase_t indexBase; 1224 1225 PetscFunctionBegin; 1226 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure a->i / values are mirrored on the GPU before use */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is already current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants used by cusparse mult routines */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps the row/column dimensions; the nonzero count is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* keep a GPU copy of the uncompressed row offsets of A (needed as csr2csc input) */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* HYB/ELL path (pre CUDA-11 only): HYB -> CSR -> CSC -> HYB round trip to obtain the transpose */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC permutation once: transpose a matrix whose k-th nonzero value is k,
         then read the transposed values back as integer indices (via PetscScalarToPetscInt) */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* Gather A's current values through the cached permutation to refresh the transpose's values */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Triangular solve x = U \ (L \ b) with cusparseSpSV; the optional row/column permutations
   (rpermIndices/cpermIndices, from the factorization ordering) are applied before/after the solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Transposed triangular solve x = L^T \ (U^T \ b); the transpose SpSV descriptors/analysis
   are created lazily on first call and re-analyzed after each numeric factorization */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if
(!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* analysis is numeric, so it must be redone whenever the factor values change
       (the flag is reset by the numeric factorization) */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Legacy-csrsv transposed solve: row-permute b into x, solve U^T then L^T with the
   pre-analyzed transposed factors, then apply the column permutation */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Natural-ordering variant of the transposed solve: no row/column permutations are applied */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy-csrsv solve x = U \ (L \ (P_r b)) followed by the column permutation P_c */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Natural-ordering variant of MatSolve: no permutations, solve L then U directly */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* ILU(0) numeric factorization: copy A's values into fact's in-place CSR buffers, run
   cusparseXcsrilu02, then (re)do the numeric SpSV analysis for the L and U solves */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* ILU(0) symbolic factorization: mirror A's nonzero pattern into fact's GPU CSR arrays
   (no fill is added for level-0 ILU, so fill_ratio_needed is exactly 1.0) */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.
The returned Ai, Aj are 32-bit */ 1799 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801 1802 /* ====================================================================== */ 1803 /* Create descriptors for M, L, U */ 1804 /* ====================================================================== */ 1805 cusparseFillMode_t fillMode; 1806 cusparseDiagType_t diagType; 1807 1808 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1809 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1810 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1811 1812 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1813 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1814 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1815 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1816 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1817 */ 1818 fillMode = CUSPARSE_FILL_MODE_LOWER; 1819 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1820 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1821 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1822 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1823 1824 fillMode = CUSPARSE_FILL_MODE_UPPER; 1825 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1826 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1829 1830 /* ========================================================================= */ 1831 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1832 /* ========================================================================= */ 1833 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1834 if (m) 1835 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1836 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1837 1838 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1839 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1840 1841 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1842 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1843 1844 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1845 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1846 1847 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1848 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1849 1850 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1851 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1852 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1853 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1854 */ 1855 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1856 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1857 fs->spsvBuffer_L = fs->factBuffer_M; 1858 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1859 } else { 1860 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1861 fs->spsvBuffer_U = fs->factBuffer_M; 1862 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1863 } 1864 1865 /* ========================================================================== */ 1866 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1867 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1868 /* ========================================================================== */ 1869 int structural_zero; 1870 cusparseStatus_t status; 1871 1872 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1873 if (m) 1874 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1875 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1876 if (PetscDefined(USE_DEBUG)) { 1877 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1878 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1879 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1880 } 1881 1882 /* Estimate FLOPs of the numeric factorization */ 1883 { 1884 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1885 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1886 PetscLogDouble flops = 0.0; 1887 1888 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1889 Ai = Aseq->i; 1890 Adiag = Aseq->diag; 1891 for (PetscInt i = 0; i < m; i++) { 1892 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1893 nzRow = Ai[i + 1] - Ai[i]; 1894 nzLeft = Adiag[i] - Ai[i]; 1895 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1896 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1897 */ 1898 nzLeft = (nzRow - 1) / 2; 1899 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1900 } 1901 } 1902 fs->numericFactFlops = flops; 1903 } 1904 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1905 PetscFunctionReturn(PETSC_SUCCESS); 1906 } 1907 1908 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1909 { 1910 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1911 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1912 const PetscScalar *barray; 1913 PetscScalar *xarray; 1914 1915 PetscFunctionBegin; 1916 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1917 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1918 PetscCall(PetscLogGpuTimeBegin()); 1919 1920 /* Solve L*y = b */ 1921 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1922 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1923 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1924 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1925 1926 /* Solve Lt*x = y */ 1927 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1928 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1929 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1930 1931 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1932 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1933 1934 PetscCall(PetscLogGpuTimeEnd()); 1935 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1936 PetscFunctionReturn(PETSC_SUCCESS); 1937 } 1938 1939 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1940 { 1941 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1942 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1943 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1944 CsrMatrix *Acsr; 1945 PetscInt m, nz; 1946 PetscBool flg; 1947 1948 PetscFunctionBegin; 1949 if (PetscDefined(USE_DEBUG)) { 1950 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1951 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1952 } 1953 1954 /* Copy A's value to fact */ 1955 m = fact->rmap->n; 1956 nz = aij->nz; 1957 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1958 Acsr = (CsrMatrix *)Acusp->mat->mat; 1959 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1960 1961 /* Factorize fact inplace */ 1962 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1963 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1964 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1965 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1966 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1967 */ 1968 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1969 if (PetscDefined(USE_DEBUG)) { 1970 int numerical_zero; 1971 cusparseStatus_t status; 1972 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1973 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1974 } 1975 1976 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1977 1978 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1979 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1980 */ 1981 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1982 1983 fact->offloadmask = PETSC_OFFLOAD_GPU; 1984 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1985 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1986 fact->ops->matsolve = NULL; 1987 fact->ops->matsolvetranspose = NULL; 1988 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1989 PetscFunctionReturn(PETSC_SUCCESS); 1990 } 

/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic setup for device ICC(0).

   Mirrors MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(): copies A's uncompressed 32-bit CSR pattern to fact,
   creates descriptors for M and L, sizes/allocates the csric02 and SpSV (L and Lt) buffers, runs the
   structural analysis, and estimates the numeric-factorization FLOPs.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU on the host (via MatLUFactorNumeric_SeqAIJ), then
   install GPU or CPU solve routines depending on use_cpu_solve and the CUDA version available.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU: reset stale device factors, delegate to the
   SeqAIJ host symbolic factorization, and install the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU: use the all-device ILU(0) fast path when
   there is no fill (levels==0), the orderings are identity, and factorization is bound to the device;
   otherwise fall back to the SeqAIJ host path.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC: use the all-device ICC(0) fast path when
   there is no fill, the permutation is identity, and factorization is bound to the device;
   otherwise fall back to the SeqAIJ host path.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky via the SeqAIJ host path. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatFactorGetSolverType_seqaij_cusparse - report this factorization's solver package name. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE =
"cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* MatGetFactor_seqaijcusparse_cusparse - create an (as yet unfactored) factor matrix B of type
   MATSEQAIJCUSPARSE for A, honoring -mat_factor_bind_factorization <host|device>, and install the
   symbolic factorization routines appropriate for the requested factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Use the factor's own options prefix if set, otherwise inherit A's */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJCUSPARSECopyFromGPU - bring the matrix values back to the host when the device copy
   is ahead (offloadmask == PETSC_OFFLOAD_GPU); afterwards both copies are valid (OFFLOAD_BOTH).
   Works for unfactored matrices and for device-factored matrices that keep their values in fs->csrVal.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
PetscFunctionReturn(PETSC_SUCCESS); 2341 } 2342 2343 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2344 { 2345 PetscFunctionBegin; 2346 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2347 *array = ((Mat_SeqAIJ *)A->data)->a; 2348 PetscFunctionReturn(PETSC_SUCCESS); 2349 } 2350 2351 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2352 { 2353 PetscFunctionBegin; 2354 A->offloadmask = PETSC_OFFLOAD_CPU; 2355 *array = NULL; 2356 PetscFunctionReturn(PETSC_SUCCESS); 2357 } 2358 2359 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2360 { 2361 PetscFunctionBegin; 2362 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2363 *array = ((Mat_SeqAIJ *)A->data)->a; 2364 PetscFunctionReturn(PETSC_SUCCESS); 2365 } 2366 2367 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2368 { 2369 PetscFunctionBegin; 2370 *array = NULL; 2371 PetscFunctionReturn(PETSC_SUCCESS); 2372 } 2373 2374 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2375 { 2376 PetscFunctionBegin; 2377 *array = ((Mat_SeqAIJ *)A->data)->a; 2378 PetscFunctionReturn(PETSC_SUCCESS); 2379 } 2380 2381 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2382 { 2383 PetscFunctionBegin; 2384 A->offloadmask = PETSC_OFFLOAD_CPU; 2385 *array = NULL; 2386 PetscFunctionReturn(PETSC_SUCCESS); 2387 } 2388 2389 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2390 { 2391 Mat_SeqAIJCUSPARSE *cusp; 2392 CsrMatrix *matrix; 2393 2394 PetscFunctionBegin; 2395 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2396 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2397 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2398 
PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 2399 matrix = (CsrMatrix *)cusp->mat->mat; 2400 2401 if (i) { 2402 #if !defined(PETSC_USE_64BIT_INDICES) 2403 *i = matrix->row_offsets->data().get(); 2404 #else 2405 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 2406 #endif 2407 } 2408 if (j) { 2409 #if !defined(PETSC_USE_64BIT_INDICES) 2410 *j = matrix->column_indices->data().get(); 2411 #else 2412 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 2413 #endif 2414 } 2415 if (a) *a = matrix->values->data().get(); 2416 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 2417 PetscFunctionReturn(PETSC_SUCCESS); 2418 } 2419 2420 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2421 { 2422 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 2423 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2424 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2425 PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2426 cusparseStatus_t stat; 2427 PetscBool both = PETSC_TRUE; 2428 2429 PetscFunctionBegin; 2430 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2431 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2432 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2433 CsrMatrix *matrix; 2434 matrix = (CsrMatrix *)cusparsestruct->mat->mat; 2435 2436 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2437 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2438 matrix->values->assign(a->a, a->a + a->nz); 2439 PetscCallCUDA(WaitForCUDA()); 2440 PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2441 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2442 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2443 } else { 2444 
PetscInt nnz; 2445 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2446 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2447 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2448 delete cusparsestruct->workVector; 2449 delete cusparsestruct->rowoffsets_gpu; 2450 cusparsestruct->workVector = NULL; 2451 cusparsestruct->rowoffsets_gpu = NULL; 2452 try { 2453 if (a->compressedrow.use) { 2454 m = a->compressedrow.nrows; 2455 ii = a->compressedrow.i; 2456 ridx = a->compressedrow.rindex; 2457 } else { 2458 m = A->rmap->n; 2459 ii = a->i; 2460 ridx = NULL; 2461 } 2462 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2463 if (!a->a) { 2464 nnz = ii[m]; 2465 both = PETSC_FALSE; 2466 } else nnz = a->nz; 2467 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2468 2469 /* create cusparse matrix */ 2470 cusparsestruct->nrows = m; 2471 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2472 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2473 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2474 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2475 2476 PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2477 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2478 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2479 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2480 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2481 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2482 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2483 2484 
/* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2485 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2486 /* set the matrix */ 2487 CsrMatrix *mat = new CsrMatrix; 2488 mat->num_rows = m; 2489 mat->num_cols = A->cmap->n; 2490 mat->num_entries = nnz; 2491 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2492 mat->row_offsets->assign(ii, ii + m + 1); 2493 2494 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2495 mat->column_indices->assign(a->j, a->j + nnz); 2496 2497 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2498 if (a->a) mat->values->assign(a->a, a->a + nnz); 2499 2500 /* assign the pointer */ 2501 matstruct->mat = mat; 2502 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2503 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2504 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2505 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2506 PetscCallCUSPARSE(stat); 2507 } 2508 #endif 2509 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2510 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2511 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2512 #else 2513 CsrMatrix *mat = new CsrMatrix; 2514 mat->num_rows = m; 2515 mat->num_cols = A->cmap->n; 2516 mat->num_entries = nnz; 2517 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2518 mat->row_offsets->assign(ii, ii + m + 1); 2519 2520 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2521 mat->column_indices->assign(a->j, a->j + nnz); 2522 2523 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2524 if (a->a) mat->values->assign(a->a, a->a + nnz); 2525 2526 cusparseHybMat_t hybMat; 
2527 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2528 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2529 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2530 PetscCallCUSPARSE(stat); 2531 /* assign the pointer */ 2532 matstruct->mat = hybMat; 2533 2534 if (mat) { 2535 if (mat->values) delete (THRUSTARRAY *)mat->values; 2536 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2537 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2538 delete (CsrMatrix *)mat; 2539 } 2540 #endif 2541 } 2542 2543 /* assign the compressed row indices */ 2544 if (a->compressedrow.use) { 2545 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2546 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2547 matstruct->cprowIndices->assign(ridx, ridx + m); 2548 tmp = m; 2549 } else { 2550 cusparsestruct->workVector = NULL; 2551 matstruct->cprowIndices = NULL; 2552 tmp = 0; 2553 } 2554 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2555 2556 /* assign the pointer */ 2557 cusparsestruct->mat = matstruct; 2558 } catch (char *ex) { 2559 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2560 } 2561 PetscCallCUDA(WaitForCUDA()); 2562 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2563 cusparsestruct->nonzerostate = A->nonzerostate; 2564 } 2565 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2566 } 2567 PetscFunctionReturn(PETSC_SUCCESS); 2568 } 2569 2570 struct VecCUDAPlusEquals { 2571 template <typename Tuple> 2572 __host__ __device__ void operator()(Tuple t) 2573 { 2574 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2575 } 2576 }; 2577 2578 struct 
VecCUDAEquals {
  /* Functor: copy first zipped element into the second (y = x) */
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor: copy in the reverse direction (x = y) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch state attached to a Mat_Product for cuSPARSE mat-mat products */
struct MatMatCusparse {
  PetscBool    cisdense; /* C was MATSEQDENSE on entry; convert back after numeric */
  PetscScalar *Bt;       /* buffer for explicit B^T (pre-CUDA-11 csrmm has no op(B)) */
  Mat          X;        /* intermediate dense product for PtAP/RARt */
  PetscBool    reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                 mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t  spgemmDesc;
#endif
};

/* Product->destroy callback: release all device buffers/descriptors owned by
   a MatMatCusparse. Note cudaFree(NULL) is a documented no-op, so Bt may be
   freed unconditionally. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* Numeric phase of C = op(A)*op(B) with A sparse (cuSPARSE) and B dense (CUDA).
   Handles AB, AtB, ABt, PtAP, RARt; the latter two compute an intermediate
   X = A*op(B) and finish with a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case
  MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cuSPARSE apply the transpose on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use (and lazily build) an explicitly stored A^T */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* write the sparse-dense partial result into the intermediate X */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* grow-only workspace: keep the old buffer when it is already big enough */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with a cuBLAS geam into the preallocated Bt buffer */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* restore original (CPU) types if the caller handed us CPU matrices */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of the sparse(A) x dense(B) product: sets sizes/types of C,
   allocates the MatMatCusparse scratch data, and installs the numeric op. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product =
C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* pick the result dimensions and propagate block sizes per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of the sparse x sparse product C = op(A)*op(B) via cuSPARSE
   SpGEMM (or csrgemm pre-CUDA-11); finishes with the bookkeeping normally done
   by MatAssemblyEnd_SeqAIJ. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType
ptype;
  MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let us replace transposed products with plain AB */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure-reuse API: only the values are recomputed here */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of the sparse x sparse product (continues past this view) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  /* ---- validate operands: both must be SEQAIJCUSPARSE and no stale product data may exist ---- */
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  /* bring the device copies of A and B up to date before touching spptr */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* for a symmetric operand, A^T B (resp. A B^T) equals A B, which lets us skip forming an explicit transpose below */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* pick result sizes (m x n, inner dim k) and the operand mult structs; since cuSPARSE spgemm
     cannot transpose (see opA/opB above), AtB/ABt use the explicitly formed transpose of A/B */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants (1 and 0), used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: zero-fill the row offsets and build empty index/value arrays, then skip the SpGEMM entirely */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; column indices and values are shared */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /* two-phase call (query size, then execute) — the cuSPARSE generic-API convention */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                       mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy (pre-11.0) csrgemm path: host pointer mode for the nnz query, then the actual gemm */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* ---- mirror the device CSR into the host Mat_SeqAIJ structure (i, j, ilen, imax, ...) ---- */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a CPU-bound operand forces the CPU path, so only query the GPU type when both are on device */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user explicitly requested the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    /* dense B: use the CUSPARSE x DENSECUDA kernel unless A is pinned to the CPU */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    /* sparse GPU operands: direct SpGEMM for two-matrix products, generic basic path for three-matrix ones */
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A*xx: thin wrapper over the shared mult-add kernel (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx,
Vec yy, Vec zz) 3520 { 3521 PetscFunctionBegin; 3522 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3523 PetscFunctionReturn(PETSC_SUCCESS); 3524 } 3525 3526 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3527 { 3528 PetscFunctionBegin; 3529 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3530 PetscFunctionReturn(PETSC_SUCCESS); 3531 } 3532 3533 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3534 { 3535 PetscFunctionBegin; 3536 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3537 PetscFunctionReturn(PETSC_SUCCESS); 3538 } 3539 3540 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3541 { 3542 PetscFunctionBegin; 3543 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3544 PetscFunctionReturn(PETSC_SUCCESS); 3545 } 3546 3547 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3548 { 3549 int i = blockIdx.x * blockDim.x + threadIdx.x; 3550 if (i < n) y[idx[i]] += x[i]; 3551 } 3552 3553 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3554 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3555 { 3556 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3557 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3558 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3559 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3560 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3561 PetscBool compressed; 3562 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3563 PetscInt nx, ny; 3564 #endif 3565 3566 PetscFunctionBegin; 3567 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3568 if (!a->nz) { 3569 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3570 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3571 PetscFunctionReturn(PETSC_SUCCESS); 3572 } 3573 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3574 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3575 if (!trans) { 3576 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3577 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3578 } else { 3579 if (herm || !A->form_explicit_transpose) { 3580 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3581 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3582 } else { 3583 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3584 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3585 } 3586 } 3587 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3588 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3589 3590 try { 3591 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3592 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3593 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3594 3595 PetscCall(PetscLogGpuTimeBegin()); 3596 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3597 /* z = A x + beta y. 3598 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3599 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3600 */ 3601 xptr = xarray; 3602 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3603 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3604 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3605 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3606 allocated to accommodate different uses. So we get the length info directly from mat. 3607 */ 3608 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3609 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3610 nx = mat->num_cols; // since y = Ax 3611 ny = mat->num_rows; 3612 } 3613 #endif 3614 } else { 3615 /* z = A^T x + beta y 3616 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3617 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3618 */ 3619 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3620 dptr = zarray; 3621 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3622 if (compressed) { /* Scatter x to work vector */ 3623 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3624 3625 thrust::for_each( 3626 #if PetscDefined(HAVE_THRUST_ASYNC) 3627 thrust::cuda::par.on(PetscDefaultCudaStream), 3628 #endif 3629 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3630 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3631 } 3632 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3633 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3634 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3635 nx = mat->num_rows; // since y = A^T x 3636 ny = mat->num_cols; 3637 } 3638 #endif 3639 } 3640 3641 /* csr_spmv does y = alpha op(A) x + beta y */ 3642 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3643 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3644 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3645 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 
3646 #else 3647 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3648 #endif 3649 3650 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3651 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3652 if (!matDescr) { 3653 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3654 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3655 } 3656 #endif 3657 3658 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3659 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3660 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3661 PetscCallCUSPARSE( 3662 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3663 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3664 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3665 PetscCallCUSPARSE( 3666 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3667 #endif 3668 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3669 } else { 3670 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3671 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3672 
PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3673 } 3674 3675 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3676 #else 3677 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3678 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3679 #endif 3680 } else { 3681 if (cusparsestruct->nrows) { 3682 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3683 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3684 #else 3685 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3686 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3687 #endif 3688 } 3689 } 3690 PetscCall(PetscLogGpuTimeEnd()); 3691 3692 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3693 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3694 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3695 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3696 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3697 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3698 } 3699 } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3700 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3701 } 3702 3703 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3704 if (compressed) { 3705 PetscCall(PetscLogGpuTimeBegin()); 3706 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3707 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3708 prevent that. So I just add a ScatterAdd kernel. 3709 */ 3710 #if 0 3711 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3712 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3713 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3714 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3715 VecCUDAPlusEquals()); 3716 #else 3717 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3718 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3719 #endif 3720 PetscCall(PetscLogGpuTimeEnd()); 3721 } 3722 } else { 3723 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3724 } 3725 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3726 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3727 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3728 } catch (char *ex) { 3729 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3730 } 3731 if (yy) { 3732 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3733 } else { 3734 PetscCall(PetscLogGpuFlops(2.0 * a->nz - 
a->nonzerorowcnt)); /* pure MatMult: one add per nonzero row is saved, hence 2*nz - nonzerorowcnt flops */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T * xx + yy; delegates to the shared mult-add kernel (flags select the transposed, non-Hermitian path) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Finish assembly on the host side; simply defers to the SeqAIJ implementation */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy GPU-side data (plain or factored), detach every composed method, then destroy the host SeqAIJ part */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* remove all composed functions so no stale pointers survive the type teardown */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate as a host SeqAIJ matrix, then convert the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y += a*X on the GPU: uses cuBLAS axpy when the nonzero patterns match, cuSPARSE spgeam for subset
   patterns, and falls back to the host SeqAIJ implementation otherwise */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* one of the two is bound to the CPU; do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nz count: compare row offsets and column indices on device to detect identical patterns */
    bool
eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cuSPARSE csr spgeam, writing the result back into Y's CSR arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta (&a, &b) live on the host for this call; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy over the nz values suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y, done as a dense cuBLAS scal over Y's nz value array on the GPU */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero the values on both the device copies (mat and matTranspose, if present) and the host, then
   set the offload mask accordingly */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values were zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Bind/unbind the matrix to the CPU by swapping the Mat and Mat_SeqAIJ operation tables between the
   SeqAIJ (host) and SeqAIJCUSPARSE (device) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host must hold current values before going CPU-only */

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* drop the device array accessors */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* install the GPU implementations and the device-side array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode routines only make sense for the host implementation */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the GPU context (handle, stream, default
   algorithms), install the device op tables, and compose the type-specific methods */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* installs the remaining GPU ops */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: build a SeqAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                     Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Register the cusparse solver type for the LU/Cholesky/ILU/ICC factorizations of MATSEQAIJCUSPARSE */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free all GPU-side state hanging off mat->spptr (mult structs, work vectors, cached index arrays, handle) */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Delete the three thrust arrays of a CsrMatrix and the struct itself, zeroing the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free a triangular-factor struct (legacy csrsv path, CUDA < 11.4 only) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Free a mult struct: the stored matrix (CSR or hyb), descriptors, scalar constants, and the
   cached SpMV/SpMM descriptors and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* one cached descriptor set per cusparseOperation_t value (non-transpose/transpose/Hermitian) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
#endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all factorization data held by the trifactors context without destroying the context itself */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* SpSV-based factorization path: free all device buffers and cusparse descriptors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reset all factor data, then destroy the cusparse handle and the trifactors context itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, col) comparator for sorting (i,j) tuples on the device */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Mark the cached transpose as stale; optionally destroy it (and the csr2csc permutation) outright */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Container destructor for the device-side COO struct: frees the device copies of perm and jmap */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Preallocate from COO (i,j) lists: run the host SeqAIJ preallocation, then mirror the resulting
   COO bookkeeping (jmap, perm) on the device for MatSetValuesCOO */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
4327 PetscBool dev_ij = PETSC_FALSE; 4328 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4329 PetscInt *i, *j; 4330 PetscContainer container_h; 4331 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4332 4333 PetscFunctionBegin; 4334 // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter 4335 PetscCall(PetscGetMemType(coo_i, &mtype)); 4336 if (PetscMemTypeDevice(mtype)) { 4337 dev_ij = PETSC_TRUE; 4338 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4339 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4340 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4341 } else { 4342 i = coo_i; 4343 j = coo_j; 4344 } 4345 4346 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4347 if (dev_ij) PetscCall(PetscFree2(i, j)); 4348 mat->offloadmask = PETSC_OFFLOAD_CPU; 4349 // Create the GPU memory 4350 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4351 4352 // Copy the COO struct to device 4353 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4354 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4355 PetscCall(PetscMalloc1(1, &coo_d)); 4356 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4357 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4358 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4359 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4360 PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4361 4362 // Put the COO struct in a container and then attach that to the matrix 4363 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 4364 
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Kernel: fold COO-ordered values kv[] into the CSR value array a[].
   For CSR entry i, jmap[i]..jmap[i+1] indexes into perm[], which maps back to
   positions in kv[]. Grid-stride loop, so any launch configuration covers nnz. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    // INSERT_VALUES overwrites the entry; ADD_VALUES accumulates onto it
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Set/add matrix values from a COO value array v[] (host or device memory) using
   the device COO struct attached by MatSetPreallocationCOO_SeqAIJCUSPARSE() */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  // INSERT_VALUES does not need the current values uploaded; ADD_VALUES does
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); // surface launch-configuration errors without clearing the error state
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode ==
INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  // Free the staging buffer if we had to copy host v[] to the device
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      // Lazily build (and cache) the uncompressed row-offset array on the device
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // No device work needed; just null the caller's pointers
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Read-only access: no state/offload-mask changes needed
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  // The caller may modify device values: GPU copy becomes authoritative and any cached transpose is stale
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Values may have changed: drop cached diagonal info and bump the object state
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // Write-only: unlike GetArray(), no MatSeqAIJCUSPARSECopyToGPU() here — current values are not needed
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, col) comparator over (i, j, value, perm) tuples used by the
   device merge in MatSeqAIJCUSPARSEMergeMats(); value/perm fields do not affect ordering */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Functor adding a fixed offset to an index (used to shift B's column indices by A's column count) */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    // Phase 1: create C with m rows and the concatenated column space of A and B
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      // Phase 2: convert A and B to COO, shift B's columns by A->cmap->n, and merge
      // the two sorted streams row-by-row on the device
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      // Aperm/Bperm tag each entry with its source (1 = from A, 0 = from B) so the
      // positions of A/B entries in merged C can be recovered below
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      // undo the in-place shift applied to B's column indices above
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      // Split merged positions back into the A part (p1) and the B part (p2) of coords
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        // Phase 3 (optional): C' is the block-diagonal stack of A' over B', so its CSR
        // arrays are plain concatenations (with B' row offsets shifted by a->nz)
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); // overwrite A's trailing offset with B's shifted first offset
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    // Phase 4: mirror the merged CSR structure back to the host SeqAIJ data
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    // MAT_REUSE_MATRIX: the structure of C is unchanged; only refresh the values by
    // scattering A's and B's values through the cached coords permutation
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather the matrix values at positions idx[] (or the first n values when idx is NULL)
   into v[], which may be host or device memory */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    // Upload the index list, then gather on the device through a permutation iterator
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                   *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); // device scratch buffer when v[] is host memory
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,
widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    // When v[] is host memory, bring the gathered values back from the device scratch buffer
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  // When v[] is host memory the data moved device -> host (cudaMemcpyDeviceToHost above),
  // so log it as GpuToCpu traffic; the previous code logged CpuToGpu, inverting the
  // transfer direction in the performance logs (compare the GpuToCpu logging after the
  // D2H copies in MatSeqAIJCUSPARSEMergeMats())
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()