/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
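
/* Illustration (a sketch, not compiled): PetscOptionsEnum() returns the 0-based position of
   the selected string, so the value strings above must appear in the same order as the
   corresponding cusparse enum values. For example,

     -mat_cusparse_spmv_alg csrmv_alg1

   selects position 2 in MatCUSPARSESpMVAlgorithms[], which must equal CUSPARSE_CSRMV_ALG1 == 2;
   the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below guard exactly this
   correspondence. */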

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
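
/* Example usage (a minimal sketch, not from the PETSc test suite; error checking omitted):
   pick ELL storage for SpMV and keep (I)LU triangular solves on the CPU, for an assembled
   n x n MATSEQAIJCUSPARSE matrix.

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, n, n, n, n);
     MatSetType(A, MATSEQAIJCUSPARSE);
     // ... preallocate, MatSetValues(), MatAssemblyBegin/End() ...
     MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL);
     MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE);
*/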
TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 200 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 201 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 202 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 204 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 205 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 206 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 207 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 208 #else 209 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 210 #endif 211 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 212 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 213 214 PetscCall( 215 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 216 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 217 #endif 218 } 219 PetscOptionsHeadEnd(); 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 225 { 226 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 227 PetscInt m = A->rmap->n; 228 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 229 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 230 const MatScalar *Aa = a->a; 231 PetscInt *Mi, *Mj, Mnz; 232 PetscScalar *Ma; 233 234 PetscFunctionBegin; 235 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 236 if (!fs->csrRowPtr) { // Is't the first time to do the setup? 

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
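
/* A small illustration (a sketch, not compiled) of the merged layout built above. PETSc's
   factored SeqAIJ stores L (minus its implicit unit diagonal) front-to-back via Ai/Aj/Aa and
   U back-to-front via Adiag[] (with the diagonal entries stored inverted). Row i of the merged
   CSR matrix M is

     [ L(i, 0..i-1) | U(i,i) = 1/Aa[Adiag[i]] | U(i, i+1..m-1) ]

   One device copy of M then serves both triangular solves: spMatDescr_L reads the lower part
   and assumes a unit diagonal (CUSPARSE_DIAG_TYPE_UNIT), while spMatDescr_U reads the diagonal
   and the upper part (CUSPARSE_DIAG_TYPE_NON_UNIT). */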
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
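
/* A quick sanity check (a sketch, not compiled) of the layout built above: for n = 3 with
   strictly lower factored entries L(1,0) and L(2,1), the constructed host arrays are

     AiLo = {0, 1, 3, 5}
     AjLo = {0,  0, 1,  1, 2}
     AALo = {1,  L(1,0), 1,  L(2,1), 1}

   i.e. each row holds its strictly lower entries followed by an explicit unit diagonal,
   consistent with nzLower = n + ai[n] - ai[1]. */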

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
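
/* For reference (a sketch of how the permutations recorded above enter the solve): with row
   permutation P from isrow and column permutation Q from iscol, the factors belong to the
   reordered matrix, so MatSolve() computes roughly

     y = U \ (L \ (P b)),   x = Q y

   rpermIndices gathers b before the triangular solves and cpermIndices scatters the result
   back, as done in the MatSolve implementations in this file. */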

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]] = i;                                                  // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
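
/* A note on the value packing above (a sketch; see MatICCFactorSymbolic_SeqAIJ() for the host
   layout): the factored SeqAIJ stores the inverted diagonal in Aa[Adiag[i]] and the strict
   upper part with a sign opposite to what the solve kernels expect, hence per row i

     D[i]          = Aa[Adiag[i]];   // already 1/U(i,i)
     Ma[Ai[i]]     = 1;              // unit diagonal, unread under CUSPARSE_DIAG_TYPE_UNIT
     Ma[Ai[i]+1+k] = -Aa[Ai[i]+k];   // sign-flipped off-diagonals of U

   so a single device copy of the unit-diagonal U backs both the U and Ut SpSV descriptors. */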

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
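
/* The stages above realize (a sketch, with P/Q the row/column permutations when present):

     U^T y = P b        (transpose SpSV with spsvDescr_Ut)
     z     = D^{-1} y   (elementwise multiply by fs->diag, which stores the inverted diagonal)
     U w   = z          (SpSV with spsvDescr_U),   x = Q w

   The logged flop count, 4.0*nz - n, accounts for the two triangular solves plus the diagonal
   scaling. */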
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
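
/* Example (a sketch of one way this path is exercised; "./app" and its options are
   placeholders): request an ICC preconditioner with the cuSPARSE solver type on a GPU matrix,

     ./app -mat_type seqaijcusparse -pc_type icc -pc_factor_mat_solver_type cusparse

   The numeric factorization itself runs on the CPU (MatCholeskyFactorNumeric_SeqAIJ() above);
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() then moves the factors to the device. */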

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1101 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1102 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1103 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1104 #else 1105 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1106 #endif 1107 PetscCallCUSPARSE(stat); 1108 } 1109 1110 PetscCallCUDA(WaitForCUDA()); 1111 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1112 1113 /* Create the solve analysis information */ 1114 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1115 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1116 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1117 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1118 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1119 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1120 #endif 1121 1122 /* perform the solve analysis */ 1123 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1124 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1125 1126 PetscCallCUDA(WaitForCUDA()); 1127 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1128 1129 /* assign the pointer */ 1130 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1131 1132 /*********************************************/ 1133 /* Now the Transpose of the Upper Tri Factor */ 1134 /*********************************************/ 1135 1136 /* allocate space for the transpose of the upper triangular factor */ 1137 PetscCall(PetscNew(&upTriFactorT)); 1138 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1139 1140 /* set the matrix descriptors of the upper triangular factor */ 1141 matrixType = cusparseGetMatType(upTriFactor->descr); 1142 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1143 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1144 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1145 1146 /* Create the matrix description */ 1147 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1148 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1149 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1150 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1151 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1152 1153 /* set the operation */ 1154 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1155 1156 /* allocate GPU space for the CSC of the upper triangular factor*/ 1157 upTriFactorT->csrMat = new CsrMatrix; 1158 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1159 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1160 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1161 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1162 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1163 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1164 1165 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1166 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1167 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1168 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1169 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1170 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1171 #endif 1172 1173 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1174 { 1175 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
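// (the status is captured in a local for the same reason as in the lower-triangular block above)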
1176 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1177 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1179 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1180 #else 1181 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1182 #endif 1183 PetscCallCUSPARSE(stat); 1184 } 1185 1186 PetscCallCUDA(WaitForCUDA()); 1187 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1188 1189 /* Create the solve analysis information */ 1190 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1191 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1192 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1193 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1194 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1195 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1196 #endif 1197 1198 /* perform the solve analysis */ 1199 /* TODO: this duplicates the lower-triangular analysis block above almost verbatim and could be hoisted into a shared helper function
*/ 1200 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1201 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1202 1203 PetscCallCUDA(WaitForCUDA()); 1204 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1205 1206 /* assign the pointer */ 1207 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 #endif 1211 1212 struct PetscScalarToPetscInt { 1213 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1214 }; 1215 1216 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1217 { 1218 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1219 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1221 cusparseStatus_t stat; 1222 cusparseIndexBase_t indexBase; 1223 1224 PetscFunctionBegin; 1225 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1226 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1227 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1228 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1229 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1230 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1231 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1232 PetscCall(PetscLogGpuTimeBegin()); 1233 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1234 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1235 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1236 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1237 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1238 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1239 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1240 1241 /* set alpha and beta */ 1242 PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1243 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1244 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 1245 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1246 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1247 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1248 1249 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1250 CsrMatrix *matrixT = new CsrMatrix; 1251 matstructT->mat = matrixT; 1252 matrixT->num_rows = A->cmap->n; 1253 matrixT->num_cols = A->rmap->n; 1254 matrixT->num_entries = a->nz; 1255 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1256 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1257 matrixT->values = new THRUSTARRAY(a->nz); 1258 1259 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1260 
cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1261 1262 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1263 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1264 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase, cusparse_scalartype); 1266 PetscCallCUSPARSE(stat); 1267 #else 1268 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1269 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1270 1271 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1272 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1273 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1274 */ 1275 if (matrixT->num_entries) { 1276 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1277 PetscCallCUSPARSE(stat); 1278 1279 } else { 1280 matstructT->matDescr = NULL; 1281 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1282 } 1283 #endif 1284 #endif 1285 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1286 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1287 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1288 #else 1289 CsrMatrix *temp = new CsrMatrix; 1290 CsrMatrix *tempT = new CsrMatrix; 1291 /* First convert HYB to CSR */ 1292 temp->num_rows = A->rmap->n; 1293 temp->num_cols = A->cmap->n; 1294 temp->num_entries = a->nz; 1295 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1296 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1297 temp->values = new THRUSTARRAY(a->nz); 1298 1299 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1300 PetscCallCUSPARSE(stat); 1301 1302 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1303 tempT->num_rows = A->rmap->n; 1304 tempT->num_cols = A->cmap->n; 1305 tempT->num_entries = a->nz; 1306 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1307 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1308 tempT->values = new THRUSTARRAY(a->nz); 1309 1310 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1311 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1312 PetscCallCUSPARSE(stat); 1313 1314 /* Last, convert CSC to HYB */ 1315 cusparseHybMat_t hybMat; 1316 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1317 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1318 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1319 PetscCallCUSPARSE(stat); 1320 1321 /* assign the pointer */ 1322 matstructT->mat = hybMat; 1323 A->transupdated = PETSC_TRUE; 1324 /* delete temporaries */ 1325 if (tempT) { 1326 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1327 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1328 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1329 delete (CsrMatrix *)tempT; 1330 } 1331 if (temp) { 1332 if (temp->values) delete (THRUSTARRAY *)temp->values; 1333 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1334 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1335 delete (CsrMatrix *)temp; 1336 } 1337 #endif 1338 } 1339 } 1340 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1341 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1342 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1343 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1344 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1345 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1346 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1347 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1348 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1349 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1350 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1351 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1352 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1353 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1354 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1355 } 1356 if (!cusparsestruct->csr2csc_i) { 1357 THRUSTARRAY csr2csc_a(matrix->num_entries); 1358 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1359 1360 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1361 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1362 void *csr2cscBuffer; 1363 size_t csr2cscBufferSize; 1364 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1365 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 1366 PetscCallCUSPARSE(stat); 1367 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1368 #endif 1369 1370 if (matrix->num_entries) { 1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1373 I checked every parameter and they were just fine. I have no clue why cusparse complains.
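(A guess: CUDA-11's csr2cscEx2() may reject nnz = 0 outright before validating the remaining arguments; the else branch below therefore skips the call and fills row_offsets directly.)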
1374 1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1376 should be filled with indexBase. So I just take a shortcut here. 1377 */ 1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1379 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1380 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1381 PetscCallCUSPARSE(stat); 1382 #else 1383 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1384 PetscCallCUSPARSE(stat); 1385 #endif 1386 } else { 1387 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1388 } 1389 1390 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1391 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1392 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1393 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1394 #endif 1395 } 1396 PetscCallThrust( 1397 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1398 } 1399 PetscCall(PetscLogGpuTimeEnd()); 1400 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1401 /* the compressed row indices is not used for matTranspose */ 1402 matstructT->cprowIndices = NULL; 1403 /* assign the pointer */ 1404 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1405 A->transupdated = PETSC_TRUE; 1406 PetscFunctionReturn(PETSC_SUCCESS); 1407 } 1408 1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1410 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1411 { 1412 const PetscScalar *barray; 1413 PetscScalar *xarray; 1414 thrust::device_ptr<const PetscScalar> bGPU; 1415 thrust::device_ptr<PetscScalar> xGPU; 1416 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1417 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1418 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1419 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1420 PetscInt m = A->rmap->n; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscLogGpuTimeBegin()); 1424 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1425 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1426 xGPU = thrust::device_pointer_cast(xarray); 1427 bGPU = thrust::device_pointer_cast(barray); 1428 1429 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1430 if (fs->rpermIndices) { 1431 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1432 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1433 } else { 1434 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1435 } 1436 1437 // Solve L Y = X 1438 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1439 // Note that cusparseSpSV_solve() secretly uses 
the external buffer used in cusparseSpSV_analysis()! 1440 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1441 1442 // Solve U X = Y 1443 if (fs->cpermIndices) { 1444 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1445 } else { 1446 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1447 } 1448 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1449 1450 // Reorder X with the column permutation if needed, and put the result back to x 1451 if (fs->cpermIndices) { 1452 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1453 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1454 } 1455 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1456 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1457 PetscCall(PetscLogGpuTimeEnd()); 1458 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1459 PetscFunctionReturn(PETSC_SUCCESS); 1460 } 1461 1462 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1463 { 1464 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1465 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1466 const PetscScalar *barray; 1467 PetscScalar *xarray; 1468 thrust::device_ptr<const PetscScalar> bGPU; 1469 thrust::device_ptr<PetscScalar> xGPU; 1470 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1471 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1472 PetscInt m = A->rmap->n; 1473 1474 PetscFunctionBegin; 1475 PetscCall(PetscLogGpuTimeBegin()); 1476 if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1477 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1478 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1479 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1480 1481 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1482 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1483 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1484 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1485 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1486 } 1487 1488 if (!fs->updatedTransposeSpSVAnalysis) { 1489 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1490 1491 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1492 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1493 } 1494 1495 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1496 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1497 xGPU = thrust::device_pointer_cast(xarray); 1498 bGPU = thrust::device_pointer_cast(barray); 1499 1500 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1501 if (fs->rpermIndices) { 1502 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1503 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1504 } else { 1505 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1506 } 1507 1508 // Solve Ut Y = X 1509 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1510 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1511 1512 // Solve Lt X = Y 1513 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1514 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1515 } else { 1516 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1517 } 1518 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1519 1520 // Reorder X with the column permutation if needed, and put the result back to x 1521 if (fs->cpermIndices) { 1522 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1523 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1524 } 1525 1526 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1527 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1528 PetscCall(PetscLogGpuTimeEnd()); 1529 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532 #else 1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
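(A plausible answer, inferred from the code rather than from cusparse documentation: the csrsv2 analysis info is bound to the operation passed to cusparseXcsrsv_analysis(), so a transpose solve would need its own analysis pass anyway, and solving with an explicitly transposed factor is generally faster than the transposed-operation path.)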
*/ 1534 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1535 { 1536 PetscInt n = xx->map->n; 1537 const PetscScalar *barray; 1538 PetscScalar *xarray; 1539 thrust::device_ptr<const PetscScalar> bGPU; 1540 thrust::device_ptr<PetscScalar> xGPU; 1541 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1542 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1543 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1544 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1545 1546 PetscFunctionBegin; 1547 /* Analyze the matrix and create the transpose ... on the fly */ 1548 if (!loTriFactorT && !upTriFactorT) { 1549 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1550 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1551 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1552 } 1553 1554 /* Get the GPU pointers */ 1555 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1556 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1557 xGPU = thrust::device_pointer_cast(xarray); 1558 bGPU = thrust::device_pointer_cast(barray); 1559 1560 PetscCall(PetscLogGpuTimeBegin()); 1561 /* First, reorder with the row permutation */ 1562 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1563 1564 /* Next, solve U */ 1565 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1566 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1567 1568 /* Then, solve L */ 1569 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1570 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1571 1572 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1573 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1574 1575 /* Copy the temporary to the full solution.
*/ 1576 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1577 1578 /* restore */ 1579 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1580 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1581 PetscCall(PetscLogGpuTimeEnd()); 1582 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1591 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1593 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1594 1595 PetscFunctionBegin; 1596 /* Analyze the matrix and create the transpose ... on the fly */ 1597 if (!loTriFactorT && !upTriFactorT) { 1598 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1599 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1600 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1601 } 1602 1603 /* Get the GPU pointers */ 1604 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1605 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1606 1607 PetscCall(PetscLogGpuTimeBegin()); 1608 /* First, solve U */ 1609 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1610 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1611 1612 /* Then, solve L */ 1613 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1614 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1615 1616 /* restore */ 1617 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1618 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1619 PetscCall(PetscLogGpuTimeEnd()); 1620 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1625 { 1626 const PetscScalar *barray; 1627 PetscScalar *xarray; 1628 thrust::device_ptr<const PetscScalar> bGPU; 1629 thrust::device_ptr<PetscScalar> xGPU; 1630 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1631 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1632 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1633 THRUSTARRAY *tempGPU = (THRUSTARRAY 
*)cusparseTriFactors->workVector; 1634 1635 PetscFunctionBegin; 1636 /* Get the GPU pointers */ 1637 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1638 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1639 xGPU = thrust::device_pointer_cast(xarray); 1640 bGPU = thrust::device_pointer_cast(barray); 1641 1642 PetscCall(PetscLogGpuTimeBegin()); 1643 /* First, reorder with the row permutation */ 1644 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1645 1646 /* Next, solve L */ 1647 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1648 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1649 1650 /* Then, solve U */ 1651 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1652 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1653 1654 /* Last, reorder with the column permutation */ 1655 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1656 1657 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1658 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1659 PetscCall(PetscLogGpuTimeEnd()); 1660 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1665 { 1666 const PetscScalar *barray; 1667 PetscScalar *xarray; 1668 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1669 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1670 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1671 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1672 1673 PetscFunctionBegin; 1674 /* Get the GPU pointers */ 1675 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1676 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1677 1678 PetscCall(PetscLogGpuTimeBegin()); 1679 /* First, solve L */ 1680 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1681 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1682 1683 /* Next, solve U */ 1684 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1685 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1686 1687 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1688 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1689 PetscCall(PetscLogGpuTimeEnd()); 1690 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1691 PetscFunctionReturn(PETSC_SUCCESS); 1692 } 1693 #endif 1694 1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1696 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1697 { 1698 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1699 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1700 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1701 CsrMatrix *Acsr; 1702 PetscInt m, nz; 1703 PetscBool flg; 1704 1705 PetscFunctionBegin; 1706 if (PetscDefined(USE_DEBUG)) { 1707 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1708 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1709 } 1710 1711 /* Copy A's value to fact */ 1712 m = fact->rmap->n; 1713 nz = aij->nz; 1714 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1715 Acsr = (CsrMatrix *)Acusp->mat->mat; 1716 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1717 1718 PetscCall(PetscLogGpuTimeBegin()); 1719 /* Factorize fact inplace */ 1720 if (m) 1721 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1722 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1723 if (PetscDefined(USE_DEBUG)) { 1724 int numerical_zero; 1725 cusparseStatus_t status; 1726 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1727 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1728 } 1729 1730 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1731 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1732 */ 1733 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1734 1735 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1736 1737 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1738 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1739 1740 fact->offloadmask = PETSC_OFFLOAD_GPU; 1741 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but 
cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1742 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1743 fact->ops->matsolve = NULL; 1744 fact->ops->matsolvetranspose = NULL; 1745 PetscCall(PetscLogGpuTimeEnd()); 1746 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1747 PetscFunctionReturn(PETSC_SUCCESS); 1748 } 1749 1750 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1751 { 1752 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1753 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1754 PetscInt m, nz; 1755 1756 PetscFunctionBegin; 1757 if (PetscDefined(USE_DEBUG)) { 1758 PetscInt i; 1759 PetscBool flg, missing; 1760 1761 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1762 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1763 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1764 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1765 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1766 } 1767 1768 /* Free the old stale stuff */ 1769 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1770 1771 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1772 but they will not be used. Allocate them just for easy debugging. 1773 */ 1774 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1775 1776 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1777 fact->factortype = MAT_FACTOR_ILU; 1778 fact->info.factor_mallocs = 0; 1779 fact->info.fill_ratio_given = info->fill; 1780 fact->info.fill_ratio_needed = 1.0; 1781 1782 aij->row = NULL; 1783 aij->col = NULL; 1784 1785 /* ====================================================================== */ 1786 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1787 /* We'll do in-place factorization on fact */ 1788 /* ====================================================================== */ 1789 const int *Ai, *Aj; 1790 1791 m = fact->rmap->n; 1792 nz = aij->nz; 1793 1794 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1795 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1797 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
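Compressed row storage keeps offsets only for rows that contain nonzeros, whereas the code below assumes a full row-pointer array of length m + 1.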
The returned Ai, Aj are 32-bit */ 1798 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1799 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800 1801 /* ====================================================================== */ 1802 /* Create descriptors for M, L, U */ 1803 /* ====================================================================== */ 1804 cusparseFillMode_t fillMode; 1805 cusparseDiagType_t diagType; 1806 1807 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1808 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1809 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1810 1811 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1812 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1813 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1814 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1815 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1816 */ 1817 fillMode = CUSPARSE_FILL_MODE_LOWER; 1818 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1819 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1820 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1821 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1822 1823 fillMode = CUSPARSE_FILL_MODE_UPPER; 1824 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1825 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1826 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1828 1829 /* ========================================================================= */ 1830 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1831 /* ========================================================================= */ 1832 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1833 if (m) 1834 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1835 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1836 1837 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1838 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1839 1840 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1841 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1842 1843 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1844 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1845 1846 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1847 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1848 1849 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1850 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1851 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1852 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 1853 */ 1854 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1855 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1856 fs->spsvBuffer_L = fs->factBuffer_M; 1857 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1858 } else { 1859 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1860 fs->spsvBuffer_U = fs->factBuffer_M; 1861 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1862 } 1863 1864 /* ========================================================================== */ 1865 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1866 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1867 /* ========================================================================== */ 1868 int structural_zero; 1869 cusparseStatus_t status; 1870 1871 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1872 if (m) 1873 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1874 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1875 if (PetscDefined(USE_DEBUG)) { 1876 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1877 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1878 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1879 } 1880 1881 /* Estimate FLOPs of the numeric factorization */ 1882 { 1883 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1884 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1885 PetscLogDouble flops = 0.0; 1886 1887 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1888 Ai = Aseq->i; 1889 Adiag = Aseq->diag; 1890 for (PetscInt i = 0; i < m; i++) { 1891 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1892 nzRow = Ai[i + 1] - Ai[i]; 1893 nzLeft = Adiag[i] - Ai[i]; 1894 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1895 and include the eliminated one will be updated, which incurs a multiplication and an addition. 
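Eliminating the j-th of the nzLeft entries then updates nzRow - j + 1 entries at 2 flops each, so summing over j = 1..nzLeft gives nzLeft * (2.0 * nzRow - nzLeft + 1).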
1896 */ 1897 1898 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1899 } 1900 } 1901 fs->numericFactFlops = flops; 1902 } 1903 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1904 PetscFunctionReturn(PETSC_SUCCESS); 1905 } 1906 1907 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1908 { 1909 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1910 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1911 const PetscScalar *barray; 1912 PetscScalar *xarray; 1913 1914 PetscFunctionBegin; 1915 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1916 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1917 PetscCall(PetscLogGpuTimeBegin()); 1918 1919 /* Solve L*y = b */ 1920 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1921 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1922 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1923 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1924 1925 /* Solve Lt*x = y */ 1926 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1927 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1928 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1929 1930 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1931 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1932 1933 PetscCall(PetscLogGpuTimeEnd()); 1934 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1935 PetscFunctionReturn(PETSC_SUCCESS); 1936 } 1937 1938 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1939 { 1940 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1941 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1942 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1943 CsrMatrix *Acsr; 1944 PetscInt m, nz; 1945 PetscBool flg; 1946 1947 PetscFunctionBegin; 1948 if (PetscDefined(USE_DEBUG)) { 1949 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1950 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1951 } 1952 1953 /* Copy A's value to fact */ 1954 m = fact->rmap->n; 1955 nz = aij->nz; 1956 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1957 Acsr = (CsrMatrix *)Acusp->mat->mat; 1958 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1959 1960 /* Factorize fact inplace */ 1961 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1962 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1963 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1964 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1965 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1966 */ 1967 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1968 if (PetscDefined(USE_DEBUG)) { 1969 int numerical_zero; 1970 cusparseStatus_t status; 1971 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1972 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1973 } 1974 1975 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1976 1977 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1978 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1979 */ 1980 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1981 1982 fact->offloadmask = PETSC_OFFLOAD_GPU; 1983 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1984 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1985 fact->ops->matsolve = NULL; 1986 fact->ops->matsolvetranspose = NULL; 1987 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1988 PetscFunctionReturn(PETSC_SUCCESS); 1989 } 1990 1991 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1992 { 1993 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1994 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1995 PetscInt m, nz; 1996 1997 PetscFunctionBegin; 1998 if (PetscDefined(USE_DEBUG)) { 1999 PetscInt i; 2000 PetscBool flg, missing; 2001 2002 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2003 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2004 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2005 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2006 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2007 } 2008 2009 /* Free the old stale stuff */ 2010 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2011 2012 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2013 but they will not be used. Allocate them just for easy debugging. 2014 */ 2015 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2016 2017 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2018 fact->factortype = MAT_FACTOR_ICC; 2019 fact->info.factor_mallocs = 0; 2020 fact->info.fill_ratio_given = info->fill; 2021 fact->info.fill_ratio_needed = 1.0; 2022 2023 aij->row = NULL; 2024 aij->col = NULL; 2025 2026 /* ====================================================================== */ 2027 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2028 /* We'll do in-place factorization on fact */ 2029 /* ====================================================================== */ 2030 const int *Ai, *Aj; 2031 2032 m = fact->rmap->n; 2033 nz = aij->nz; 2034 2035 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2036 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2037 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2038 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2039 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2040 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2041 2042 /* ====================================================================== */ 2043 /* Create mat descriptors for M, L */ 2044 /* ====================================================================== */ 2045 cusparseFillMode_t fillMode; 2046 cusparseDiagType_t diagType; 2047 2048 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2049 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2050 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2051 2052 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2053 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2054 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2055 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2056 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
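     This is why diagType is set to CUSPARSE_DIAG_TYPE_NON_UNIT below: the IC(0) factor L carries an
     actual (non-unit) diagonal that must be read during the triangular solves.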
2057 */
2058 fillMode = CUSPARSE_FILL_MODE_LOWER;
2059 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2060 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2061 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2062 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2063
2064 /* ========================================================================= */
2065 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2066 /* ========================================================================= */
2067 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2068 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2069
2070 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2071 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2072
2073 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2074 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2075
2076 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2077 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2078
2079 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2080 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2081
2082 /* To save device memory, we share the factorization buffer with the larger of the two triangular-solve buffers.
2083 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2084 */
2085 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2086 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2087 fs->spsvBuffer_L = fs->factBuffer_M;
2088 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2089 } else {
2090 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2091 fs->spsvBuffer_Lt = fs->factBuffer_M;
2092 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2093 }
2094
2095 /* ========================================================================== */
2096 /* Perform analysis of ic0 on M */
2097 /* The lower triangular part of M has the same sparsity pattern as L */
2098 /* ========================================================================== */
2099 int structural_zero;
2100 cusparseStatus_t status;
2101
2102 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2103 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2104 if (PetscDefined(USE_DEBUG)) {
2105 /* Function cusparseXcsric02_zeroPivot() is a blocking call.
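        (For this reason the check is only performed when PETSc is configured with debugging.)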
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2106 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2107 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2108 }
2109
2110 /* Estimate FLOPs of the numeric factorization */
2111 {
2112 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2113 PetscInt *Ai, nzRow, nzLeft;
2114 PetscLogDouble flops = 0.0;
2115
2116 Ai = Aseq->i;
2117 for (PetscInt i = 0; i < m; i++) {
2118 nzRow = Ai[i + 1] - Ai[i];
2119 if (nzRow > 1) {
2120 /* We eliminate the nonzeros to the left of the diagonal one by one; assume about half of the row's off-diagonal entries lie left of the diagonal, i.e., nzLeft = (nzRow - 1)/2.
2121 Eliminating the k-th of them updates itself and every nonzero to its right (nzRow - k + 1 entries), at a multiplication and an addition each; summing 2*(nzRow - k + 1) over k = 1..nzLeft gives nzLeft * (2*nzRow - nzLeft + 1).
2122 */
2123 nzLeft = (nzRow - 1) / 2;
2124 flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2125 }
2126 }
2127 fs->numericFactFlops = flops;
2128 }
2129 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2130 PetscFunctionReturn(PETSC_SUCCESS);
2131 }
2132 #endif
2133
2134 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2135 {
2136 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2137 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2138
2139 PetscFunctionBegin;
2140 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2141 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2142 B->offloadmask = PETSC_OFFLOAD_CPU;
2143
2144 if (!cusparsestruct->use_cpu_solve) {
2145 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2146 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2147 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2148 #else
2149 /* determine which version of MatSolve needs to be used.
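       If both the row and column permutations are identities, the cheaper natural-ordering variants
       are selected, which can skip the permutations of b and x in every solve.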
*/ 2150 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2151 IS isrow = b->row, iscol = b->col; 2152 PetscBool row_identity, col_identity; 2153 2154 PetscCall(ISIdentity(isrow, &row_identity)); 2155 PetscCall(ISIdentity(iscol, &col_identity)); 2156 if (row_identity && col_identity) { 2157 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2158 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2159 } else { 2160 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2161 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2162 } 2163 #endif 2164 } 2165 B->ops->matsolve = NULL; 2166 B->ops->matsolvetranspose = NULL; 2167 2168 /* get the triangular factors */ 2169 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2174 { 2175 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2176 2177 PetscFunctionBegin; 2178 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2179 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2180 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2181 PetscFunctionReturn(PETSC_SUCCESS); 2182 } 2183 2184 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2185 { 2186 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2187 2188 PetscFunctionBegin; 2189 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2190 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2191 if (cusparseTriFactors->factorizeOnDevice) { 2192 PetscCall(ISIdentity(isrow, &row_identity)); 2193 PetscCall(ISIdentity(iscol, &col_identity)); 2194 } 2195 if (!info->levels && row_identity && col_identity) { 2196 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2197 } else 2198 #endif 2199 { 2200 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2201 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2202 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2203 } 2204 PetscFunctionReturn(PETSC_SUCCESS); 2205 } 2206 2207 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2208 { 2209 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2210 2211 PetscFunctionBegin; 2212 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2213 PetscBool perm_identity = PETSC_FALSE; 2214 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 2215 if (!info->levels && perm_identity) { 2216 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2217 } else 2218 #endif 2219 { 2220 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2221 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2222 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2223 } 2224 PetscFunctionReturn(PETSC_SUCCESS); 2225 } 2226 2227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2228 { 2229 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2230 2231 PetscFunctionBegin; 2232 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2233 
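  /* Run the symbolic Cholesky on the CPU, then point the numeric phase at the CUSPARSE version below */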
PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2234 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2235 PetscFunctionReturn(PETSC_SUCCESS);
2236 }
2237
2238 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2239 {
2240 PetscFunctionBegin;
2241 *type = MATSOLVERCUSPARSE;
2242 PetscFunctionReturn(PETSC_SUCCESS);
2243 }
2244
2245 /*MC
2246 MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2247 of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2248 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2249 performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2250 CuSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
2251 algorithms are not recommended. This class does NOT support direct solver operations.
2252
2253 Level: beginner
2254
2255 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2256 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2257 M*/
2258
2259 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2260 {
2261 PetscInt n = A->rmap->n;
2262 PetscBool factOnDevice, factOnHost;
2263 char *prefix;
2264 char factPlace[32] = "device"; /* the default */
2265
2266 PetscFunctionBegin;
2267 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2268 PetscCall(MatSetSizes(*B, n, n, n, n));
2269 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2270 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2271
2272 prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2273 PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
2274 PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2275 PetscOptionsEnd();
2276 PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2277 PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2278 PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>.
Only host and device are allowed", factPlace); 2279 ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2280 2281 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2282 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2283 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2284 if (!A->boundtocpu) { 2285 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2286 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2287 } else { 2288 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2289 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2290 } 2291 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2292 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2293 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2294 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2295 if (!A->boundtocpu) { 2296 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2297 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2298 } else { 2299 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2300 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2301 } 2302 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2303 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2304 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2305 2306 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2307 (*B)->canuseordering = PETSC_TRUE; 2308 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 2309 PetscFunctionReturn(PETSC_SUCCESS); 2310 } 2311 2312 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2313 { 2314 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2315 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2316 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2317 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 2318 #endif 2319 2320 PetscFunctionBegin; 2321 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2322 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2323 if (A->factortype == MAT_FACTOR_NONE) { 2324 CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 2325 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2326 } 2327 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2328 else if (fs->csrVal) { 2329 /* We have a factorized matrix on device and are able to copy it to host */ 2330 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2331 } 2332 #endif 2333 else 2334 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 2335 PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 2336 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2337 A->offloadmask = PETSC_OFFLOAD_BOTH; 2338 } 2339 PetscFunctionReturn(PETSC_SUCCESS); 2340 } 2341 2342 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2343 { 2344 PetscFunctionBegin; 2345 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2346 
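  /* The host array is now current; hand it out. The matching restore routine marks the host copy as the
     only valid one (PETSC_OFFLOAD_CPU), since the caller may have modified the values */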
*array = ((Mat_SeqAIJ *)A->data)->a;
2347 PetscFunctionReturn(PETSC_SUCCESS);
2348 }
2349
2350 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2351 {
2352 PetscFunctionBegin;
2353 A->offloadmask = PETSC_OFFLOAD_CPU;
2354 *array = NULL;
2355 PetscFunctionReturn(PETSC_SUCCESS);
2356 }
2357
2358 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2359 {
2360 PetscFunctionBegin;
2361 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2362 *array = ((Mat_SeqAIJ *)A->data)->a;
2363 PetscFunctionReturn(PETSC_SUCCESS);
2364 }
2365
2366 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2367 {
2368 PetscFunctionBegin;
2369 *array = NULL;
2370 PetscFunctionReturn(PETSC_SUCCESS);
2371 }
2372
2373 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2374 {
2375 PetscFunctionBegin;
2376 *array = ((Mat_SeqAIJ *)A->data)->a;
2377 PetscFunctionReturn(PETSC_SUCCESS);
2378 }
2379
2380 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2381 {
2382 PetscFunctionBegin;
2383 A->offloadmask = PETSC_OFFLOAD_CPU;
2384 *array = NULL;
2385 PetscFunctionReturn(PETSC_SUCCESS);
2386 }
2387
2388 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2389 {
2390 Mat_SeqAIJCUSPARSE *cusp;
2391 CsrMatrix *matrix;
2392
2393 PetscFunctionBegin;
2394 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2395 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2396 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2397 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2398 matrix = (CsrMatrix *)cusp->mat->mat;
2399
2400 if (i) {
2401 #if !defined(PETSC_USE_64BIT_INDICES)
2402 *i = matrix->row_offsets->data().get();
2403 #else
2404 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2405 #endif
2406 }
2407 if (j) {
2408 #if !defined(PETSC_USE_64BIT_INDICES)
2409 *j = matrix->column_indices->data().get();
2410 #else
2411 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2412 #endif
2413 }
2414 if (a) *a = matrix->values->data().get();
2415 if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2416 PetscFunctionReturn(PETSC_SUCCESS);
2417 }
2418
2419 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2420 {
2421 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2422 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2423 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2424 PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2425 cusparseStatus_t stat;
2426 PetscBool both = PETSC_TRUE;
2427
2428 PetscFunctionBegin;
2429 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2430 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2431 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2432 CsrMatrix *matrix;
2433 matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2434
2435 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2436 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2437 matrix->values->assign(a->a, a->a + a->nz);
2438 PetscCallCUDA(WaitForCUDA());
2439
PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2440 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2441 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2442 } else { 2443 PetscInt nnz; 2444 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2445 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2446 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2447 delete cusparsestruct->workVector; 2448 delete cusparsestruct->rowoffsets_gpu; 2449 cusparsestruct->workVector = NULL; 2450 cusparsestruct->rowoffsets_gpu = NULL; 2451 try { 2452 if (a->compressedrow.use) { 2453 m = a->compressedrow.nrows; 2454 ii = a->compressedrow.i; 2455 ridx = a->compressedrow.rindex; 2456 } else { 2457 m = A->rmap->n; 2458 ii = a->i; 2459 ridx = NULL; 2460 } 2461 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2462 if (!a->a) { 2463 nnz = ii[m]; 2464 both = PETSC_FALSE; 2465 } else nnz = a->nz; 2466 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2467 2468 /* create cusparse matrix */ 2469 cusparsestruct->nrows = m; 2470 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2471 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2472 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2473 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2474 2475 PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2476 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2477 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2478 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2479 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2480 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2481 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2482 2483 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2484 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2485 /* set the matrix */ 2486 CsrMatrix *mat = new CsrMatrix; 2487 mat->num_rows = m; 2488 mat->num_cols = A->cmap->n; 2489 mat->num_entries = nnz; 2490 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2491 mat->row_offsets->assign(ii, ii + m + 1); 2492 2493 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2494 mat->column_indices->assign(a->j, a->j + nnz); 2495 2496 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2497 if (a->a) mat->values->assign(a->a, a->a + nnz); 2498 2499 /* assign the pointer */ 2500 matstruct->mat = mat; 2501 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2502 if (mat->num_rows) { /* cusparse errors on empty matrices! 
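       Hence the descriptor is created only when mat->num_rows is nonzero.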
*/ 2503 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2504 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2505 PetscCallCUSPARSE(stat); 2506 } 2507 #endif 2508 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2509 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2510 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2511 #else 2512 CsrMatrix *mat = new CsrMatrix; 2513 mat->num_rows = m; 2514 mat->num_cols = A->cmap->n; 2515 mat->num_entries = nnz; 2516 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2517 mat->row_offsets->assign(ii, ii + m + 1); 2518 2519 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2520 mat->column_indices->assign(a->j, a->j + nnz); 2521 2522 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2523 if (a->a) mat->values->assign(a->a, a->a + nnz); 2524 2525 cusparseHybMat_t hybMat; 2526 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2527 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2528 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2529 PetscCallCUSPARSE(stat); 2530 /* assign the pointer */ 2531 matstruct->mat = hybMat; 2532 2533 if (mat) { 2534 if (mat->values) delete (THRUSTARRAY *)mat->values; 2535 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2536 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2537 delete (CsrMatrix *)mat; 2538 } 2539 #endif 2540 } 2541 2542 /* assign the compressed row indices */ 2543 if (a->compressedrow.use) { 2544 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2545 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2546 matstruct->cprowIndices->assign(ridx, ridx + m); 2547 tmp = m; 2548 } else { 2549 cusparsestruct->workVector = NULL; 2550 matstruct->cprowIndices = NULL; 2551 tmp = 0; 2552 } 2553 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2554 2555 /* assign the pointer */ 2556 cusparsestruct->mat = matstruct; 2557 } catch (char *ex) { 2558 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2559 } 2560 PetscCallCUDA(WaitForCUDA()); 2561 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2562 cusparsestruct->nonzerostate = A->nonzerostate; 2563 } 2564 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2565 } 2566 PetscFunctionReturn(PETSC_SUCCESS); 2567 } 2568 2569 struct VecCUDAPlusEquals { 2570 template <typename Tuple> 2571 __host__ __device__ void operator()(Tuple t) 2572 { 2573 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2574 } 2575 }; 2576 2577 struct VecCUDAEquals { 2578 template <typename Tuple> 2579 __host__ __device__ void operator()(Tuple t) 2580 { 2581 thrust::get<1>(t) = thrust::get<0>(t); 2582 } 2583 }; 2584 2585 struct VecCUDAEqualsReverse { 2586 template <typename Tuple> 2587 __host__ __device__ void operator()(Tuple t) 2588 { 2589 thrust::get<0>(t) = thrust::get<1>(t); 2590 } 2591 }; 2592 2593 struct 
MatMatCusparse { 2594 PetscBool cisdense; 2595 PetscScalar *Bt; 2596 Mat X; 2597 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2598 PetscLogDouble flops; 2599 CsrMatrix *Bcsr; 2600 2601 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2602 cusparseSpMatDescr_t matSpBDescr; 2603 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2604 cusparseDnMatDescr_t matBDescr; 2605 cusparseDnMatDescr_t matCDescr; 2606 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2607 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2608 void *dBuffer4; 2609 void *dBuffer5; 2610 #endif 2611 size_t mmBufferSize; 2612 void *mmBuffer; 2613 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2614 cusparseSpGEMMDescr_t spgemmDesc; 2615 #endif 2616 }; 2617 2618 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2619 { 2620 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2621 2622 PetscFunctionBegin; 2623 PetscCallCUDA(cudaFree(mmdata->Bt)); 2624 delete mmdata->Bcsr; 2625 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2626 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2627 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2628 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2629 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2630 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2631 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2632 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2633 #endif 2634 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2635 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2636 #endif 2637 PetscCall(MatDestroy(&mmdata->X)); 2638 PetscCall(PetscFree(data)); 2639 PetscFunctionReturn(PETSC_SUCCESS); 2640 } 2641 2642 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2643 2644 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2645 { 2646 Mat_Product *product = C->product; 2647 Mat A, B; 2648 PetscInt m, n, blda, clda; 2649 PetscBool flg, biscuda; 2650 Mat_SeqAIJCUSPARSE *cusp; 2651 cusparseStatus_t stat; 2652 cusparseOperation_t opA; 2653 const PetscScalar *barray; 2654 PetscScalar *carray; 2655 MatMatCusparse *mmdata; 2656 Mat_SeqAIJCUSPARSEMultStruct *mat; 2657 CsrMatrix *csrmat; 2658 2659 PetscFunctionBegin; 2660 MatCheckProduct(C, 1); 2661 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2662 mmdata = (MatMatCusparse *)product->data; 2663 A = product->A; 2664 B = product->B; 2665 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2666 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2667 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2668 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2669 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2670 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2671 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2672 switch (product->type) { 2673 case MATPRODUCT_AB: 2674 case MATPRODUCT_PtAP: 2675 mat = cusp->mat; 2676 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2677 m = A->rmap->n; 2678 n 
= B->cmap->n; 2679 break; 2680 case MATPRODUCT_AtB: 2681 if (!A->form_explicit_transpose) { 2682 mat = cusp->mat; 2683 opA = CUSPARSE_OPERATION_TRANSPOSE; 2684 } else { 2685 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2686 mat = cusp->matTranspose; 2687 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2688 } 2689 m = A->cmap->n; 2690 n = B->cmap->n; 2691 break; 2692 case MATPRODUCT_ABt: 2693 case MATPRODUCT_RARt: 2694 mat = cusp->mat; 2695 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2696 m = A->rmap->n; 2697 n = B->rmap->n; 2698 break; 2699 default: 2700 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2701 } 2702 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2703 csrmat = (CsrMatrix *)mat->mat; 2704 /* if the user passed a CPU matrix, copy the data to the GPU */ 2705 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2706 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2707 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2708 2709 PetscCall(MatDenseGetLDA(B, &blda)); 2710 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2711 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2712 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2713 } else { 2714 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2715 PetscCall(MatDenseGetLDA(C, &clda)); 2716 } 2717 2718 PetscCall(PetscLogGpuTimeBegin()); 2719 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2720 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2721 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2722 cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2723 #else 2724 cusparseSpMatDescr_t &matADescr = mat->matDescr; 2725 #endif 2726 2727 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2728 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2729 size_t mmBufferSize; 2730 if (mmdata->initialized && mmdata->Blda != blda) { 2731 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2732 mmdata->matBDescr = NULL; 2733 } 2734 if (!mmdata->matBDescr) { 2735 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2736 mmdata->Blda = blda; 2737 } 2738 2739 if (mmdata->initialized && mmdata->Clda != clda) { 2740 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2741 mmdata->matCDescr = NULL; 2742 } 2743 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2744 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2745 mmdata->Clda = clda; 2746 } 2747 2748 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2749 if (matADescr) { 2750 PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. 
It could be a cusparse bug 2751 matADescr = NULL; 2752 } 2753 #endif 2754 2755 if (!matADescr) { 2756 stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2757 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2758 PetscCallCUSPARSE(stat); 2759 } 2760 2761 PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2762 2763 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2764 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2765 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2766 mmdata->mmBufferSize = mmBufferSize; 2767 } 2768 2769 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0 2770 PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2771 #endif 2772 2773 mmdata->initialized = PETSC_TRUE; 2774 } else { 2775 /* to be safe, always update pointers of the mats */ 2776 PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 2777 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2778 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2779 } 2780 2781 /* do cusparseSpMM, which supports transpose on B */ 2782 PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2783 #else 2784 PetscInt k; 2785 /* cusparseXcsrmm does not support transpose on B */ 2786 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2787 cublasHandle_t cublasv2handle; 2788 cublasStatus_t cerr; 2789 2790 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2791 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2792 PetscCallCUBLAS(cerr); 2793 blda = B->cmap->n; 2794 k = B->cmap->n; 2795 } else { 2796 k = B->rmap->n; 2797 } 2798 2799 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2800 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2801 PetscCallCUSPARSE(stat); 2802 #endif 2803 PetscCall(PetscLogGpuTimeEnd()); 2804 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2805 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2806 if (product->type == MATPRODUCT_RARt) { 2807 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2808 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2809 } else if (product->type == MATPRODUCT_PtAP) { 2810 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2811 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2812 } else { 2813 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2814 } 2815 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2816 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2817 PetscFunctionReturn(PETSC_SUCCESS); 2818 } 2819 2820 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2821 { 2822 Mat_Product *product = C->product; 2823 Mat A, B; 2824 PetscInt m, n; 2825 PetscBool cisdense, flg; 2826 MatMatCusparse *mmdata; 2827 Mat_SeqAIJCUSPARSE *cusp; 2828 2829 PetscFunctionBegin; 2830 MatCheckProduct(C, 1); 2831 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2832 A = product->A; 2833 B = product->B; 2834 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2835 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2836 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2837 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2838 switch (product->type) { 2839 case MATPRODUCT_AB: 2840 m = A->rmap->n; 2841 n = B->cmap->n; 2842 break; 2843 case MATPRODUCT_AtB: 2844 m = A->cmap->n; 2845 n = B->cmap->n; 2846 break; 2847 case MATPRODUCT_ABt: 2848 m = A->rmap->n; 2849 n = B->rmap->n; 2850 break; 2851 case MATPRODUCT_PtAP: 2852 m = B->cmap->n; 2853 n = B->cmap->n; 2854 break; 2855 case MATPRODUCT_RARt: 2856 m = B->rmap->n; 2857 n = B->rmap->n; 2858 break; 2859 default: 2860 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2861 } 2862 PetscCall(MatSetSizes(C, m, n, m, n)); 2863 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2864 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2865 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2866 2867 /* product data */ 2868 PetscCall(PetscNew(&mmdata)); 2869 mmdata->cisdense = cisdense; 2870 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2871 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2872 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2873 #endif 2874 /* for these products we need intermediate storage */ 2875 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2876 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2877 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2878 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to 
MatDenseCUDAGetArray will preallocate on the GPU for us */ 2879 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2880 } else { 2881 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2882 } 2883 } 2884 C->product->data = mmdata; 2885 C->product->destroy = MatDestroy_MatMatCusparse; 2886 2887 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2888 PetscFunctionReturn(PETSC_SUCCESS); 2889 } 2890 2891 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2892 { 2893 Mat_Product *product = C->product; 2894 Mat A, B; 2895 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2896 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2897 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2898 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2899 PetscBool flg; 2900 cusparseStatus_t stat; 2901 MatProductType ptype; 2902 MatMatCusparse *mmdata; 2903 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2904 cusparseSpMatDescr_t BmatSpDescr; 2905 #endif 2906 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2907 2908 PetscFunctionBegin; 2909 MatCheckProduct(C, 1); 2910 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2911 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 2912 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2913 mmdata = (MatMatCusparse *)C->product->data; 2914 A = product->A; 2915 B = product->B; 2916 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2917 mmdata->reusesym = PETSC_FALSE; 2918 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2919 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2920 Cmat = Ccusp->mat; 2921 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2922 Ccsr = (CsrMatrix *)Cmat->mat; 2923 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2924 goto finalize; 2925 } 2926 if (!c->nz) goto finalize; 2927 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2928 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2929 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2930 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2931 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2932 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2933 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2934 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2935 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2936 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2937 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2938 
PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2939 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2940 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2941 2942 ptype = product->type; 2943 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2944 ptype = MATPRODUCT_AB; 2945 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2946 } 2947 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2948 ptype = MATPRODUCT_AB; 2949 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2950 } 2951 switch (ptype) { 2952 case MATPRODUCT_AB: 2953 Amat = Acusp->mat; 2954 Bmat = Bcusp->mat; 2955 break; 2956 case MATPRODUCT_AtB: 2957 Amat = Acusp->matTranspose; 2958 Bmat = Bcusp->mat; 2959 break; 2960 case MATPRODUCT_ABt: 2961 Amat = Acusp->mat; 2962 Bmat = Bcusp->matTranspose; 2963 break; 2964 default: 2965 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2966 } 2967 Cmat = Ccusp->mat; 2968 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2969 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2970 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2971 Acsr = (CsrMatrix *)Amat->mat; 2972 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2973 Ccsr = (CsrMatrix *)Cmat->mat; 2974 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2975 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2976 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2977 PetscCall(PetscLogGpuTimeBegin()); 2978 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2979 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2980 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2981 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2982 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2983 PetscCallCUSPARSE(stat); 2984 #else 2985 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2986 PetscCallCUSPARSE(stat); 2987 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2988 PetscCallCUSPARSE(stat); 2989 #endif 2990 #else 2991 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2992 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 2993 PetscCallCUSPARSE(stat); 2994 #endif 2995 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2996 PetscCallCUDA(WaitForCUDA()); 2997 PetscCall(PetscLogGpuTimeEnd()); 2998 C->offloadmask = PETSC_OFFLOAD_GPU; 2999 finalize: 3000 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3001 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 3002 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 3003 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 3004 c->reallocs = 0; 3005 C->info.mallocs += 0; 3006 C->info.nz_unneeded = 0; 3007 C->assembled = C->was_assembled = PETSC_TRUE; 3008 C->num_ass++; 3009 PetscFunctionReturn(PETSC_SUCCESS); 3010 } 3011 3012 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3013 { 3014 Mat_Product *product = C->product; 3015 Mat A, B; 3016 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 3017 Mat_SeqAIJ *a, *b, *c; 3018 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 3019 CsrMatrix *Acsr, *Bcsr, *Ccsr; 3020 PetscInt i, j, m, n, k; 3021 PetscBool flg; 3022 cusparseStatus_t stat; 3023 MatProductType ptype; 3024 MatMatCusparse *mmdata; 3025 PetscLogDouble flops; 3026 PetscBool biscompressed, ciscompressed; 3027 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3028 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3029 cusparseSpMatDescr_t BmatSpDescr; 3030 #else 3031 int cnz; 3032 #endif 3033 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3034 3035 PetscFunctionBegin; 3036 MatCheckProduct(C, 1); 3037 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3038 A = product->A; 3039 B = product->B; 3040 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3041 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3042 
PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3043 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3044 a = (Mat_SeqAIJ *)A->data;
3045 b = (Mat_SeqAIJ *)B->data;
3046 /* product data */
3047 PetscCall(PetscNew(&mmdata));
3048 C->product->data = mmdata;
3049 C->product->destroy = MatDestroy_MatMatCusparse;
3050
3051 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3052 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3053 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3054 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3055 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3056 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3057
3058 ptype = product->type;
3059 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3060 ptype = MATPRODUCT_AB;
3061 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3062 }
3063 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3064 ptype = MATPRODUCT_AB;
3065 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3066 }
3067 biscompressed = PETSC_FALSE;
3068 ciscompressed = PETSC_FALSE;
3069 switch (ptype) {
3070 case MATPRODUCT_AB:
3071 m = A->rmap->n;
3072 n = B->cmap->n;
3073 k = A->cmap->n;
3074 Amat = Acusp->mat;
3075 Bmat = Bcusp->mat;
3076 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3077 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3078 break;
3079 case MATPRODUCT_AtB:
3080 m = A->cmap->n;
3081 n = B->cmap->n;
3082 k = A->rmap->n;
3083 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3084 Amat = Acusp->matTranspose;
3085 Bmat = Bcusp->mat;
3086 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3087 break;
3088 case MATPRODUCT_ABt:
3089 m = A->rmap->n;
3090 n = B->rmap->n;
3091 k = A->cmap->n;
3092 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3093 Amat = Acusp->mat;
3094 Bmat = Bcusp->matTranspose;
3095 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3096 break;
3097 default:
3098 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3099 }
3100
3101 /* create cusparse matrix */
3102 PetscCall(MatSetSizes(C, m, n, m, n));
3103 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3104 c = (Mat_SeqAIJ *)C->data;
3105 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3106 Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3107 Ccsr = new CsrMatrix;
3108
3109 c->compressedrow.use = ciscompressed;
3110 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
3111 c->compressedrow.nrows = a->compressedrow.nrows;
3112 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3113 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3114 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3115 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3116 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3117 } else {
3118 c->compressedrow.nrows = 0;
3119 c->compressedrow.i = NULL;
3120 c->compressedrow.rindex = NULL;
3121 Ccusp->workVector = NULL;
3122 Cmat->cprowIndices = NULL;
3123 }
3124 Ccusp->nrows = ciscompressed
? c->compressedrow.nrows : m; 3125 Ccusp->mat = Cmat; 3126 Ccusp->mat->mat = Ccsr; 3127 Ccsr->num_rows = Ccusp->nrows; 3128 Ccsr->num_cols = n; 3129 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3130 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3131 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3132 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3133 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3134 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3135 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3136 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3137 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3138 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3139 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3140 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3141 c->nz = 0; 3142 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3143 Ccsr->values = new THRUSTARRAY(c->nz); 3144 goto finalizesym; 3145 } 3146 3147 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3148 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3149 Acsr = (CsrMatrix *)Amat->mat; 3150 if (!biscompressed) { 3151 Bcsr = (CsrMatrix *)Bmat->mat; 3152 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3153 BmatSpDescr = Bmat->matDescr; 3154 #endif 3155 } else { /* we need to use row offsets for the full matrix */ 3156 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3157 Bcsr = new CsrMatrix; 3158 Bcsr->num_rows = B->rmap->n; 3159 Bcsr->num_cols = cBcsr->num_cols; 3160 Bcsr->num_entries = cBcsr->num_entries; 3161 Bcsr->column_indices = cBcsr->column_indices; 3162 Bcsr->values = cBcsr->values; 3163 if (!Bcusp->rowoffsets_gpu) { 3164 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3165 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3166 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3167 } 3168 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3169 mmdata->Bcsr = Bcsr; 3170 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3171 if (Bcsr->num_rows && Bcsr->num_cols) { 3172 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3173 PetscCallCUSPARSE(stat); 3174 } 3175 BmatSpDescr = mmdata->matSpBDescr; 3176 #endif 3177 } 3178 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3179 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3180 /* precompute flops count */ 3181 if (ptype == MATPRODUCT_AB) { 3182 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3183 const PetscInt st = a->i[i]; 3184 const PetscInt en = a->i[i + 1]; 3185 for (j = st; j < en; j++) { 3186 const PetscInt brow = a->j[j]; 3187 flops += 2. 
* (b->i[brow + 1] - b->i[brow]); 3188 } 3189 } 3190 } else if (ptype == MATPRODUCT_AtB) { 3191 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3192 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3193 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3194 flops += (2. * anzi) * bnzi; 3195 } 3196 } else { /* TODO */ 3197 flops = 0.; 3198 } 3199 3200 mmdata->flops = flops; 3201 PetscCall(PetscLogGpuTimeBegin()); 3202 3203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3204 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3205 // cuda-12.2 requires non-null csrRowOffsets 3206 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3207 PetscCallCUSPARSE(stat); 3208 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3209 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3210 { 3211 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3212 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3213 */ 3214 void *dBuffer1 = NULL; 3215 void *dBuffer2 = NULL; 3216 void *dBuffer3 = NULL; 3217 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3218 size_t bufferSize1 = 0; 3219 size_t bufferSize2 = 0; 3220 size_t bufferSize3 = 0; 3221 size_t bufferSize4 = 0; 3222 size_t bufferSize5 = 0; 3223 3224 /* ask bufferSize1 bytes for external memory */ 3225 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3226 PetscCallCUSPARSE(stat); 3227 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3228 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3229 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3230 PetscCallCUSPARSE(stat); 3231 3232 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3233 PetscCallCUSPARSE(stat); 3234 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3235 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3236 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3237 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3238 PetscCallCUSPARSE(stat); 3239 PetscCallCUDA(cudaFree(dBuffer1)); 3240 PetscCallCUDA(cudaFree(dBuffer2)); 3241 3242 /* get matrix C non-zero entries C_nnz1 */ 3243 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3244 c->nz = (PetscInt)C_nnz1; 3245 /* allocate matrix C */ 3246 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3247 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3248 Ccsr->values = new THRUSTARRAY(c->nz); 3249 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3250 /* update matC with the new pointers */ 3251 stat = cusparseCsrSetPointers(Cmat->matDescr, 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask how many bytes of external memory are needed */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask again how many bytes of external memory are needed */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* Neither the CUSPARSE documentation nor the API is clear:
     we need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
     it only appears in the workEstimation calls, yet it seems to be needed in compute,
     so probably its address is stored in the descriptor! What a messy API...
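     To summarize the sequence we end up with: workEstimation(NULL) sizes mmBuffer2,
     workEstimation(mmBuffer2) inspects A and B, compute(NULL) sizes mmBuffer,
     compute(mmBuffer) forms the product, and SpMatGetSize/cusparseCsrSetPointers/SpGEMM_copy
     below extract C. We keep both buffers alive in mmdata since the descriptor appears to
     reference them.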
  */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
     D is NULL, despite the fact that the CUSPARSE documentation claims it is supported!
*/ 3314 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3315 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3316 PetscCallCUSPARSE(stat); 3317 #endif 3318 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3319 PetscCall(PetscLogGpuTimeEnd()); 3320 finalizesym: 3321 c->singlemalloc = PETSC_FALSE; 3322 c->free_a = PETSC_TRUE; 3323 c->free_ij = PETSC_TRUE; 3324 PetscCall(PetscMalloc1(m + 1, &c->i)); 3325 PetscCall(PetscMalloc1(c->nz, &c->j)); 3326 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3327 PetscInt *d_i = c->i; 3328 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3329 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3330 ii = *Ccsr->row_offsets; 3331 jj = *Ccsr->column_indices; 3332 if (ciscompressed) d_i = c->compressedrow.i; 3333 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3334 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3335 } else { 3336 PetscInt *d_i = c->i; 3337 if (ciscompressed) d_i = c->compressedrow.i; 3338 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3339 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3340 } 3341 if (ciscompressed) { /* need to expand host row offsets */ 3342 PetscInt r = 0; 3343 c->i[0] = 0; 3344 for (k = 0; k < c->compressedrow.nrows; k++) { 3345 const PetscInt next = c->compressedrow.rindex[k]; 3346 const PetscInt old = c->compressedrow.i[k]; 3347 for (; r < next; r++) c->i[r + 1] = old; 3348 } 3349 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3350 } 3351 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3352 PetscCall(PetscMalloc1(m, &c->ilen)); 3353 PetscCall(PetscMalloc1(m, &c->imax)); 3354 c->maxnz = c->nz; 3355 c->nonzerorowcnt = 0; 3356 c->rmax = 0; 3357 for (k = 0; k < m; k++) { 3358 const PetscInt nn = c->i[k + 1] - c->i[k]; 3359 c->ilen[k] = c->imax[k] = nn; 3360 c->nonzerorowcnt += (PetscInt) !!nn; 3361 c->rmax = PetscMax(c->rmax, nn); 3362 } 3363 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3364 PetscCall(PetscMalloc1(c->nz, &c->a)); 3365 Ccsr->num_entries = c->nz; 3366 3367 C->nonzerostate++; 3368 PetscCall(PetscLayoutSetUp(C->rmap)); 3369 PetscCall(PetscLayoutSetUp(C->cmap)); 3370 Ccusp->nonzerostate = C->nonzerostate; 3371 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3372 C->preallocated = PETSC_TRUE; 3373 C->assembled = PETSC_FALSE; 3374 C->was_assembled = PETSC_FALSE; 3375 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3376 mmdata->reusesym = PETSC_TRUE; 3377 C->offloadmask = PETSC_OFFLOAD_GPU; 3378 } 3379 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3380 PetscFunctionReturn(PETSC_SUCCESS); 3381 } 3382 3383 PETSC_INTERN 
PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3384 3385 /* handles sparse or dense B */ 3386 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3387 { 3388 Mat_Product *product = mat->product; 3389 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3390 3391 PetscFunctionBegin; 3392 MatCheckProduct(mat, 1); 3393 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3394 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3395 if (product->type == MATPRODUCT_ABC) { 3396 Ciscusp = PETSC_FALSE; 3397 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3398 } 3399 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3400 PetscBool usecpu = PETSC_FALSE; 3401 switch (product->type) { 3402 case MATPRODUCT_AB: 3403 if (product->api_user) { 3404 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3405 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3406 PetscOptionsEnd(); 3407 } else { 3408 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3409 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3410 PetscOptionsEnd(); 3411 } 3412 break; 3413 case MATPRODUCT_AtB: 3414 if (product->api_user) { 3415 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3416 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3417 PetscOptionsEnd(); 3418 } else { 3419 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3420 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3421 PetscOptionsEnd(); 3422 } 3423 break; 3424 case MATPRODUCT_PtAP: 3425 if (product->api_user) { 3426 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3427 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3428 PetscOptionsEnd(); 3429 } else { 3430 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3431 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3432 PetscOptionsEnd(); 3433 } 3434 break; 3435 case MATPRODUCT_RARt: 3436 if (product->api_user) { 3437 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3438 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3439 PetscOptionsEnd(); 3440 } else { 3441 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3442 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3443 PetscOptionsEnd(); 3444 } 3445 break; 3446 case MATPRODUCT_ABC: 3447 if (product->api_user) { 3448 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3449 
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3450 PetscOptionsEnd(); 3451 } else { 3452 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3453 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3454 PetscOptionsEnd(); 3455 } 3456 break; 3457 default: 3458 break; 3459 } 3460 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3461 } 3462 /* dispatch */ 3463 if (isdense) { 3464 switch (product->type) { 3465 case MATPRODUCT_AB: 3466 case MATPRODUCT_AtB: 3467 case MATPRODUCT_ABt: 3468 case MATPRODUCT_PtAP: 3469 case MATPRODUCT_RARt: 3470 if (product->A->boundtocpu) { 3471 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3472 } else { 3473 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3474 } 3475 break; 3476 case MATPRODUCT_ABC: 3477 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3478 break; 3479 default: 3480 break; 3481 } 3482 } else if (Biscusp && Ciscusp) { 3483 switch (product->type) { 3484 case MATPRODUCT_AB: 3485 case MATPRODUCT_AtB: 3486 case MATPRODUCT_ABt: 3487 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3488 break; 3489 case MATPRODUCT_PtAP: 3490 case MATPRODUCT_RARt: 3491 case MATPRODUCT_ABC: 3492 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3493 break; 3494 default: 3495 break; 3496 } 3497 } else { /* fallback for AIJ */ 3498 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3499 } 3500 PetscFunctionReturn(PETSC_SUCCESS); 3501 } 3502 3503 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3504 { 3505 PetscFunctionBegin; 3506 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3507 PetscFunctionReturn(PETSC_SUCCESS); 3508 } 3509 3510 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3511 { 3512 PetscFunctionBegin; 3513 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3514 PetscFunctionReturn(PETSC_SUCCESS); 3515 } 3516 3517 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3518 { 3519 PetscFunctionBegin; 3520 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3521 PetscFunctionReturn(PETSC_SUCCESS); 3522 } 3523 3524 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3525 { 3526 PetscFunctionBegin; 3527 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3528 PetscFunctionReturn(PETSC_SUCCESS); 3529 } 3530 3531 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3532 { 3533 PetscFunctionBegin; 3534 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3535 PetscFunctionReturn(PETSC_SUCCESS); 3536 } 3537 3538 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3539 { 3540 int i = blockIdx.x * blockDim.x + threadIdx.x; 3541 if (i < n) y[idx[i]] += x[i]; 3542 } 3543 3544 /* z = op(A) x + y. 
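   Here yy may be NULL (plain MatMult, giving z = op(A) x), and zz may alias yy (MatMultAdd with the same input/output vector); both cases are handled below.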
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols; // since y = Ax
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ?
matstruct->beta_one : matstruct->beta_zero; 3613 if (compressed) { /* Scatter x to work vector */ 3614 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3615 3616 thrust::for_each( 3617 #if PetscDefined(HAVE_THRUST_ASYNC) 3618 thrust::cuda::par.on(PetscDefaultCudaStream), 3619 #endif 3620 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3621 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3622 } 3623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3624 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3625 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3626 nx = mat->num_rows; // since y = A^T x 3627 ny = mat->num_cols; 3628 } 3629 #endif 3630 } 3631 3632 /* csr_spmv does y = alpha op(A) x + beta y */ 3633 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3634 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3635 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3636 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 3637 #else 3638 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3639 #endif 3640 3641 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3642 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3643 if (!matDescr) { 3644 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3645 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3646 } 3647 #endif 3648 3649 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3650 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3651 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3652 PetscCallCUSPARSE( 3653 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3654 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3655 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3656 PetscCallCUSPARSE( 3657 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3658 #endif 3659 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3660 } else { 3661 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3662 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3663 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3664 } 3665 3666 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, 
matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed: thrust::async::for_each() returns an event (internally registered),
           and when that event is destroyed at the end of the scope, it calls cudaStreamSynchronize() on this stream. One would have to store all
           such events to prevent that, so I just use a ScatterAdd kernel instead.
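           The replacement kernel below is launched on PetscDefaultCudaStream with (n + 255) / 256 blocks of 256 threads,
           one thread per compressed row: thread i adds workVector[i] into zarray[cprowIndices[i]].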
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.
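  Example Usage:
  A minimal creation/assembly sketch (the sizes `m` and `n` and the per-row nonzero estimate 5 are placeholders):
.vb
  Mat A;

  PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, n, 5, NULL, &A));
  // insert entries with MatSetValues(), then assemble
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
.ve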
3779 3780 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE` 3781 @*/ 3782 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3783 { 3784 PetscFunctionBegin; 3785 PetscCall(MatCreate(comm, A)); 3786 PetscCall(MatSetSizes(*A, m, n, m, n)); 3787 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3788 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3789 PetscFunctionReturn(PETSC_SUCCESS); 3790 } 3791 3792 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3793 { 3794 PetscFunctionBegin; 3795 if (A->factortype == MAT_FACTOR_NONE) { 3796 PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 3797 } else { 3798 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3799 } 3800 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3801 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3802 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3803 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3804 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3805 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3806 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3807 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3808 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3809 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3810 PetscCall(MatDestroy_SeqAIJ(A)); 3811 PetscFunctionReturn(PETSC_SUCCESS); 3812 } 3813 3814 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3815 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3816 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3817 { 3818 PetscFunctionBegin; 3819 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3820 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3821 PetscFunctionReturn(PETSC_SUCCESS); 3822 } 3823 3824 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3825 { 3826 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3827 Mat_SeqAIJCUSPARSE *cy; 3828 Mat_SeqAIJCUSPARSE *cx; 3829 PetscScalar *ay; 3830 const PetscScalar *ax; 3831 CsrMatrix *csry, *csrx; 3832 3833 PetscFunctionBegin; 3834 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3835 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3836 if (X->ops->axpy != Y->ops->axpy) { 3837 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3838 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3839 PetscFunctionReturn(PETSC_SUCCESS); 3840 } 3841 /* if we are here, it means both matrices are bound to GPU */ 3842 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3843 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3844 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3845 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), 
PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3846 csry = (CsrMatrix *)cy->mat->mat; 3847 csrx = (CsrMatrix *)cx->mat->mat; 3848 /* see if we can turn this into a cublas axpy */ 3849 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3850 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3851 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3852 if (eq) str = SAME_NONZERO_PATTERN; 3853 } 3854 /* spgeam is buggy with one column */ 3855 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3856 3857 if (str == SUBSET_NONZERO_PATTERN) { 3858 PetscScalar b = 1.0; 3859 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3860 size_t bufferSize; 3861 void *buffer; 3862 #endif 3863 3864 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3865 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3866 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3867 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3868 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3869 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3870 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3871 PetscCall(PetscLogGpuTimeBegin()); 3872 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3873 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3874 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3875 PetscCall(PetscLogGpuTimeEnd()); 3876 PetscCallCUDA(cudaFree(buffer)); 3877 #else 3878 PetscCall(PetscLogGpuTimeBegin()); 3879 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3880 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3881 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3882 PetscCall(PetscLogGpuTimeEnd()); 3883 #endif 3884 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3885 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3886 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3887 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3888 } else if (str == SAME_NONZERO_PATTERN) { 3889 cublasHandle_t cublasv2handle; 3890 PetscBLASInt one = 1, bnz = 1; 3891 3892 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3893 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3894 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3895 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3896 PetscCall(PetscLogGpuTimeBegin()); 3897 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3898 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3899 PetscCall(PetscLogGpuTimeEnd()); 3900 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3901 
PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3902 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3903 } else { 3904 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3905 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3906 } 3907 PetscFunctionReturn(PETSC_SUCCESS); 3908 } 3909 3910 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3911 { 3912 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3913 PetscScalar *ay; 3914 cublasHandle_t cublasv2handle; 3915 PetscBLASInt one = 1, bnz = 1; 3916 3917 PetscFunctionBegin; 3918 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3919 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3920 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3921 PetscCall(PetscLogGpuTimeBegin()); 3922 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3923 PetscCall(PetscLogGpuFlops(bnz)); 3924 PetscCall(PetscLogGpuTimeEnd()); 3925 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3926 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3927 PetscFunctionReturn(PETSC_SUCCESS); 3928 } 3929 3930 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3931 { 3932 PetscBool both = PETSC_FALSE; 3933 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3934 3935 PetscFunctionBegin; 3936 if (A->factortype == MAT_FACTOR_NONE) { 3937 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3938 if (spptr->mat) { 3939 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3940 if (matrix->values) { 3941 both = PETSC_TRUE; 3942 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3943 } 3944 } 3945 if (spptr->matTranspose) { 3946 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3947 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3948 } 3949 } 3950 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3951 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3952 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3953 else A->offloadmask = PETSC_OFFLOAD_CPU; 3954 PetscFunctionReturn(PETSC_SUCCESS); 3955 } 3956 3957 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3958 { 3959 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3960 3961 PetscFunctionBegin; 3962 if (A->factortype != MAT_FACTOR_NONE) { 3963 A->boundtocpu = flg; 3964 PetscFunctionReturn(PETSC_SUCCESS); 3965 } 3966 if (flg) { 3967 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3968 3969 A->ops->scale = MatScale_SeqAIJ; 3970 A->ops->axpy = MatAXPY_SeqAIJ; 3971 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3972 A->ops->mult = MatMult_SeqAIJ; 3973 A->ops->multadd = MatMultAdd_SeqAIJ; 3974 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3975 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3976 A->ops->multhermitiantranspose = NULL; 3977 A->ops->multhermitiantransposeadd = NULL; 3978 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3979 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3980 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3981 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3982 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3983 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3984 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3985 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 
3986 } else { 3987 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3988 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3989 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3990 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3991 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3992 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3993 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3994 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3995 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3996 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3997 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3998 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3999 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4000 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4001 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4002 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4003 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4004 4005 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4006 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4007 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4008 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4009 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4010 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4011 } 4012 A->boundtocpu = flg; 4013 if (flg && a->inode.size) { 4014 a->inode.use = PETSC_TRUE; 4015 } else { 4016 a->inode.use = PETSC_FALSE; 4017 } 4018 PetscFunctionReturn(PETSC_SUCCESS); 4019 } 4020 4021 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4022 { 4023 Mat B; 4024 4025 PetscFunctionBegin; 4026 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4027 if (reuse == MAT_INITIAL_MATRIX) { 4028 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4029 } else if (reuse == MAT_REUSE_MATRIX) { 4030 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4031 } 4032 B = *newmat; 4033 4034 PetscCall(PetscFree(B->defaultvectype)); 4035 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4036 4037 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4038 if (B->factortype == MAT_FACTOR_NONE) { 4039 Mat_SeqAIJCUSPARSE *spptr; 4040 PetscCall(PetscNew(&spptr)); 4041 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4042 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4043 spptr->format = MAT_CUSPARSE_CSR; 4044 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4045 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4046 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4047 #else 4048 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4049 #endif 4050 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4051 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4052 
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
  CSR, ELL, or hybrid format.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                     Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4101 - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4102 4103 Level: beginner 4104 4105 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4106 M*/ 4107 4108 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4109 { 4110 PetscFunctionBegin; 4111 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 4112 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 4113 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 4114 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4115 PetscFunctionReturn(PETSC_SUCCESS); 4116 } 4117 4118 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4119 { 4120 Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4121 4122 PetscFunctionBegin; 4123 if (cusp) { 4124 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 4125 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4126 delete cusp->workVector; 4127 delete cusp->rowoffsets_gpu; 4128 delete cusp->csr2csc_i; 4129 delete cusp->coords; 4130 if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 4131 PetscCall(PetscFree(mat->spptr)); 4132 } 4133 PetscFunctionReturn(PETSC_SUCCESS); 4134 } 4135 4136 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4137 { 4138 PetscFunctionBegin; 4139 if (*mat) { 4140 delete (*mat)->values; 4141 delete (*mat)->column_indices; 4142 delete (*mat)->row_offsets; 4143 delete *mat; 4144 *mat = 0; 4145 } 4146 PetscFunctionReturn(PETSC_SUCCESS); 4147 } 4148 4149 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4150 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4151 { 4152 PetscFunctionBegin; 4153 if (*trifactor) { 4154 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4155 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4156 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4157 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4158 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4159 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4160 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4161 #endif 4162 PetscCall(PetscFree(*trifactor)); 4163 } 4164 PetscFunctionReturn(PETSC_SUCCESS); 4165 } 4166 #endif 4167 4168 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4169 { 4170 CsrMatrix *mat; 4171 4172 PetscFunctionBegin; 4173 if (*matstruct) { 4174 if ((*matstruct)->mat) { 4175 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4176 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4177 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4178 #else 4179 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4180 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4181 #endif 4182 } else { 4183 mat = (CsrMatrix *)(*matstruct)->mat; 4184 
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
#endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
PetscCall(PetscFree(fs->csrRowPtr_h)); 4260 PetscCall(PetscFree(fs->csrVal_h)); 4261 PetscCall(PetscFree(fs->diag_h)); 4262 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4263 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4264 #endif 4265 } 4266 PetscFunctionReturn(PETSC_SUCCESS); 4267 } 4268 4269 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4270 { 4271 PetscFunctionBegin; 4272 if (*trifactors) { 4273 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4274 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 4275 PetscCall(PetscFree(*trifactors)); 4276 } 4277 PetscFunctionReturn(PETSC_SUCCESS); 4278 } 4279 4280 struct IJCompare { 4281 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4282 { 4283 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4284 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4285 return false; 4286 } 4287 }; 4288 4289 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4290 { 4291 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4292 4293 PetscFunctionBegin; 4294 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4295 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4296 if (destroy) { 4297 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4298 delete cusp->csr2csc_i; 4299 cusp->csr2csc_i = NULL; 4300 } 4301 A->transupdated = PETSC_FALSE; 4302 PetscFunctionReturn(PETSC_SUCCESS); 4303 } 4304 4305 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data) 4306 { 4307 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data; 4308 4309 PetscFunctionBegin; 4310 PetscCallCUDA(cudaFree(coo->perm)); 4311 PetscCallCUDA(cudaFree(coo->jmap)); 4312 PetscCall(PetscFree(coo)); 4313 PetscFunctionReturn(PETSC_SUCCESS); 4314 } 4315 4316 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4317 { 4318 PetscBool dev_ij = PETSC_FALSE; 4319 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4320 PetscInt *i, *j; 4321 PetscContainer container_h, container_d; 4322 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4323 4324 PetscFunctionBegin; 4325 // The two MatResetPreallocationCOO_* must be done in order. 
The former relies on values that might be destroyed by the latter 4326 PetscCall(PetscGetMemType(coo_i, &mtype)); 4327 if (PetscMemTypeDevice(mtype)) { 4328 dev_ij = PETSC_TRUE; 4329 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4330 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4331 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4332 } else { 4333 i = coo_i; 4334 j = coo_j; 4335 } 4336 4337 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4338 if (dev_ij) PetscCall(PetscFree2(i, j)); 4339 mat->offloadmask = PETSC_OFFLOAD_CPU; 4340 // Create the GPU memory 4341 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4342 4343 // Copy the COO struct to device 4344 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4345 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4346 PetscCall(PetscMalloc1(1, &coo_d)); 4347 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4348 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4349 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4350 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4351 PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4352 4353 // Put the COO struct in a container and then attach that to the matrix 4354 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 4355 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 4356 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 4357 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 4358 PetscCall(PetscContainerDestroy(&container_d)); 4359 PetscFunctionReturn(PETSC_SUCCESS); 4360 } 4361 4362 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4363 { 4364 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4365 const PetscCount grid_size = gridDim.x * blockDim.x; 4366 for (; i < nnz; i += grid_size) { 4367 PetscScalar sum = 0.0; 4368 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4369 a[i] = (imode == INSERT_VALUES ? 

static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* if the user provided v[] on the host, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
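
/* Example usage (a sketch, not from the original source): assembling a small 2x3
   MATSEQAIJCUSPARSE matrix through the COO path implemented above, via the public
   MatSetPreallocationCOO() and MatSetValuesCOO() interface. The repeated (0,1) entry
   is summed during assembly.

     Mat         A;
     PetscInt    rows[] = {0, 0, 1, 0};
     PetscInt    cols[] = {0, 1, 2, 1};
     PetscScalar vals[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 3, 2, 3));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, rows, cols));
     PetscCall(MatSetValuesCOO(A, vals, INSERT_VALUES)); // vals may also be a device pointer
     PetscCall(MatDestroy(&A));
*/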

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}
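
/* Example usage (a sketch, not from the original source): handing the device CSR structure of a
   MATSEQAIJCUSPARSE matrix A to user CUDA code. Here my_kernel, nblocks, nthreads, and nrows
   are hypothetical; i and j point to GPU memory and must not be freed by the caller.

     const int *i, *j;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j));
     my_kernel<<<nblocks, nthreads>>>(nrows, i, j); // read-only traversal of the CSR pattern
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/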

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
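
/* Example usage (a sketch, not from the original source): copying the stored values of A to a
   hypothetical host buffer h_vals of length nz (the number of stored nonzeros).

     const PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
     PetscCallCUDA(cudaMemcpy(h_vals, a, nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
*/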

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a            = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
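
/* Example usage (a sketch, not from the original source, assuming real-valued PetscScalar):
   scaling all stored values of A on the device with Thrust; nz is the number of stored
   nonzeros. The restore call bumps the object state so PETSc knows the matrix changed.

     PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a));
     auto da = thrust::device_pointer_cast(a);
     thrust::transform(da, da + nz, da, 2.0 * thrust::placeholders::_1); // a[k] *= 2
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a));
*/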

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
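
/* Illustrative note (not from the original source): Shift offsets indices on the fly through a
   transform iterator, e.g.

     auto it = thrust::make_transform_iterator(cols->begin(), Shift(ncolA));

   reads cols[k] + ncolA without modifying the stored indices; the merge below uses it to place
   B's columns to the right of A's. */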

/* Merges two SeqAIJCUSPARSE matrices A and B by concatenating them column-wise: row i of C is
   row i of A followed by row i of B, i.e. C = [A, B] ([A';B']' operation in MATLAB notation) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 // Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
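      // copy_if/remove_copy_if emulate the disabled partition_copy: merged positions flagged 1
      // (entries from A) are gathered to p1, those flagged 0 (entries from B) to p2, so coords
      // records where each original entry of A and B landed in C.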
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C's transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-to-64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4934 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4935 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4936 PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 4937 auto pmid = Ccusp->coords->begin(); 4938 thrust::advance(pmid, Acsr->num_entries); 4939 PetscCall(PetscLogGpuTimeBegin()); 4940 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 4941 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4942 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4943 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4944 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end()))); 4945 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4946 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4947 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4948 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4949 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4950 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4951 CsrMatrix *BcsrT = BT ? 

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* the gathered data lands on the host when v is not device memory */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
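
/* Illustrative note (not from the original source): the gather above is equivalent to
   v[k] = av[idx[k]] for k = 0..n-1, performed on the device through a permutation iterator;
   e.g. with av = {10, 20, 30, 40} and idx = {3, 0} the routine returns v = {40, 10}. */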