/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String table used by PetscOptionsEnum(); the trailing three entries are the enum name,
   the option prefix, and the required NULL terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
    CUSPARSE_MV_ALG_DEFAULT = 0,
    CUSPARSE_COOMV_ALG      = 1,
    CUSPARSE_CSRMV_ALG1     = 2,
    CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
    CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
    CUSPARSE_SPMM_ALG_DEFAULT = 0,
    CUSPARSE_SPMM_COO_ALG1    = 1,
    CUSPARSE_SPMM_COO_ALG2    = 2,
    CUSPARSE_SPMM_COO_ALG3    = 3,
    CUSPARSE_SPMM_COO_ALG4    = 5,
    CUSPARSE_SPMM_CSR_ALG1    = 4,
    CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Entries are listed in 0-based integer value order so that the PetscEnum index returned by
   PetscOptionsEnum() matches the cuSPARSE enum value (checked with PetscCheck() in MatSetFromOptions) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations for the routines defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
/* Legacy csrsv-based triangular solves, only used before CUDA 11.3 introduced the SpSV API */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Record the requested GPU storage format in A's cuSPARSE data structure. Both supported
   operations currently map to the single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall-through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the composed method so non-CUSPARSE types are silently ignored */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Toggle whether triangular solves with the (I)LU factors run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
 .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Handle the options that need GPU-side bookkeeping; everything else is forwarded to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Parse the -mat_cusparse_* runtime options (storage format, CPU solve, and cuSPARSE algorithm choices).
   Only applies to unfactored matrices; factored matrices skip the whole option set. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Pack the host LU factors (PETSc's skewed L/U layout in Mat_SeqAIJ) into one regular CSR matrix
   M = (L - I) + U on the device, and (re)run the cusparseSpSV analysis needed for triangular solves.
   Symbolic setup (row pointers, descriptors, buffers) happens only on the first call; later calls
   only refresh the numerical values. (continues below) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];     // row i of L (strictly lower part)
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // row i of U including its diagonal
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same CSR arrays; only the fill mode/diag type attributes differ
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (the host factor stores its inverse)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Legacy path (CUDA < 11.3): build (or refresh) the unit-diagonal lower triangular factor L
   on the GPU from the host ILU factors, and run the csrsv solve analysis once at creation. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
       */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: allocate, fill, and ship the whole factor to the GPU */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0; /* explicit unit diagonal */
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* host values kept for later value-only refreshes */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy path (CUDA < 11.3): build (or refresh) the non-unit-diagonal upper triangular factor U
   on the GPU from the host ILU factors. The host layout stores U rows backwards from a->diag,
   hence the reverse iteration below. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
       */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first call: allocate, fill, and ship the whole factor to the GPU */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* host factor stores the inverse diagonal at v[nz] */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* host values kept for later value-only refreshes */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Push the freshly computed host (I)LU factors to the GPU (via the version-appropriate builder)
   and cache the row/column permutation indices on the device when the orderings are not identities. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Pack the host Cholesky/ICC factor (U with unit diagonal plus separate diagonal D) into CSR on
   the device and set up the SpSV descriptors for both U and U^T solves.
   NOTE(review): the 'Cheolesky' misspelling in the name is pre-existing; the symbol is referenced
   elsewhere in this file, so it is kept unchanged here. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]] = i;                                                  // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (csrVal) are uploaded on every numeric refresh below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse (host staging arrays are kept alive for subsequent numeric refreshes)
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values: negate the strict upper part and insert the (cosmetic) unit diagonal
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];    // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0;  // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Build the upper-triangular ICC factor U (and 1/d * U for the "lower" solve) on the GPU
   from the host factored matrix; legacy csrsv path for CUDA < 11.3. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ here; presumably the
     ICC factor is laid out SBAIJ-style on the host -- confirm against MatICCFactorSymbolic_SeqAIJ */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* negate the off-diagonal entries; the "lower" factor additionally scales by 1/diag */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the "lower" factor is stored as the upper factor solved with OPERATION_TRANSPOSE, hence FILL_MODE_UPPER */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Repeat numeric factorization: structure already on the GPU, refresh the values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Mirror the host ICC factors of A on the GPU and cache the (inverse) permutation as device arrays. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* nnz of the symmetric factor pair: strict triangle counted twice plus the diagonal once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1003 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1004 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1005 PetscCall(ISRestoreIndices(iip, &irip)); 1006 PetscCall(ISDestroy(&iip)); 1007 PetscCall(ISRestoreIndices(ip, &rip)); 1008 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1009 } 1010 PetscFunctionReturn(PETSC_SUCCESS); 1011 } 1012 1013 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1014 { 1015 PetscFunctionBegin; 1016 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1017 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1018 B->offloadmask = PETSC_OFFLOAD_CPU; 1019 1020 #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0) 1021 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1022 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1023 #else 1024 /* determine which version of MatSolve needs to be used. */ 1025 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1026 IS ip = b->row; 1027 PetscBool perm_identity; 1028 1029 PetscCall(ISIdentity(ip, &perm_identity)); 1030 if (perm_identity) { 1031 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1032 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1033 } else { 1034 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1035 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1036 } 1037 #endif 1038 B->ops->matsolve = NULL; 1039 B->ops->matsolvetranspose = NULL; 1040 1041 /* get the triangular factors */ 1042 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1043 PetscFunctionReturn(PETSC_SUCCESS); 1044 } 1045 1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0) 1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1048 { 1049 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1050 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1051 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1052 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1053 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1054 cusparseIndexBase_t indexBase; 1055 cusparseMatrixType_t matrixType; 1056 cusparseFillMode_t fillMode; 1057 cusparseDiagType_t diagType; 1058 1059 PetscFunctionBegin; 1060 /* allocate space for the transpose of the lower triangular factor */ 1061 PetscCall(PetscNew(&loTriFactorT)); 1062 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1063 1064 /* set the matrix descriptors of the lower triangular factor */ 1065 matrixType = cusparseGetMatType(loTriFactor->descr); 1066 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1067 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1068 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1069 1070 /* Create the matrix description */ 1071 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1072 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1073 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1074 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1075 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1076 1077 /* set the operation */ 1078 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1079 1080 /* allocate GPU space for the CSC of the lower triangular factor*/ 1081 loTriFactorT->csrMat = new CsrMatrix; 1082 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1083 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1084 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1085 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1086 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1087 
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1088 1089 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1090 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1091 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1092 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1094 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1095 #endif 1096 1097 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1098 { 1099 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1100 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1101 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1102 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1103 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1104 #else 1105 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1106 #endif 1107 PetscCallCUSPARSE(stat); 1108 } 1109 1110 PetscCallCUDA(WaitForCUDA()); 1111 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1112 1113 /* Create the solve analysis information */ 1114 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1115 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1116 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1117 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1118 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1119 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1120 #endif 1121 1122 /* perform the solve analysis */ 1123 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1124 
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1125 1126 PetscCallCUDA(WaitForCUDA()); 1127 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1128 1129 /* assign the pointer */ 1130 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1131 1132 /*********************************************/ 1133 /* Now the Transpose of the Upper Tri Factor */ 1134 /*********************************************/ 1135 1136 /* allocate space for the transpose of the upper triangular factor */ 1137 PetscCall(PetscNew(&upTriFactorT)); 1138 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1139 1140 /* set the matrix descriptors of the upper triangular factor */ 1141 matrixType = cusparseGetMatType(upTriFactor->descr); 1142 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1143 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1144 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1145 1146 /* Create the matrix description */ 1147 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1148 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1149 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1150 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1151 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1152 1153 /* set the operation */ 1154 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1155 1156 /* allocate GPU space for the CSC of the upper triangular factor*/ 1157 upTriFactorT->csrMat = new CsrMatrix; 1158 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1159 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1160 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1161 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1162 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1163 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1164 1165 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1166 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1167 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1168 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1169 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1170 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1171 #endif 1172 1173 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1174 { 1175 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1176 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1177 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1179 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1180 #else 1181 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1182 #endif 1183 PetscCallCUSPARSE(stat); 1184 } 1185 1186 PetscCallCUDA(WaitForCUDA()); 1187 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1188 1189 /* Create the solve analysis information */ 1190 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1191 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1192 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1193 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1194 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1195 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1196 #endif 1197 1198 /* perform the solve analysis */ 1199 /* christ, would it have killed you to put this stuff in a function????????? */ 1200 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1201 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1202 1203 PetscCallCUDA(WaitForCUDA()); 1204 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1205 1206 /* assign the pointer */ 1207 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 #endif 1211 1212 struct PetscScalarToPetscInt { 1213 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1214 }; 1215 1216 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1217 { 1218 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1219 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1221 cusparseStatus_t stat; 1222 cusparseIndexBase_t indexBase; 1223 1224 PetscFunctionBegin; 1225 
  /* Make sure A's CSR data is current on the GPU before deriving the transpose from it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  /* if the transpose is flagged up-to-date, the cached transpose struct must exist */
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose still valid; nothing to do */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* ELL/HYB storage cannot be updated in place, so discard any stale transpose first */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr); /* transpose inherits A's index base (0- or 1-based) */
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants (1, 0, 1) used by SpMV calls with the transpose */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat    = matrixT;
      /* the transpose has A's dimensions swapped */
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* cache A's host row offsets on the GPU; the csr2csc conversion below reads them as input */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* legacy (pre-CUDA-11) path: HYB has no direct transpose, so round-trip HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps A's (not A^T's) row/col counts here — presumably relies on square
         matrices on this legacy path; confirm before reusing for rectangular A */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the csr->csc index permutation once: run csr2csc on the sequence 0,1,2,... so the
         transposed "values" are the source positions, then convert them to PetscInt indices */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* scatter A's values into the transpose via the cached permutation (cheaper than redoing csr2csc) */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Solve A x = b with the LU factors held in fs (SpSV path, CUDA >= 11.3); applies the row
   permutation to b before and the column permutation to x after the two triangular solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A^T x = b with the LU factors in fs: since (LU)^T = U^T L^T, solve with U^T first then
   L^T, reusing spMatDescr_L/U with CUSPARSE_OPERATION_TRANSPOSE; the transpose SpSV descriptors
   and analysis are created lazily on first use */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* re-run the (numeric) analysis whenever the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Legacy (CUDA < 11.3) transpose solve: uses explicitly transposed triangular factors (built on
   first use), solving U^T then L^T with the csrsv API */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy transpose solve for natural (identity) ordering: same as above but no row/column
   permutation copies are required */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy (CUDA < 11.3) forward solve A x = b: row-permute b, solve L then U, column-permute x */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>    xGPU;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU = b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: reads xarray, writes tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x = tempGPU[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy forward solve for natural (identity) ordering: no permutations, just L then U solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
/* Numeric ILU(0) factorization on the GPU (CUDA >= 11.3): copies A's values into fact's device CSR
   storage, factorizes in place with csrilu02, then runs the SpSV analyses used by MatSolve */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  /* ILU(0) keeps A's sparsity pattern, so a device-to-device value copy suffices (pattern was set up in the symbolic phase) */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* debug-only: detect a zero pivot produced by the numeric factorization */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve  = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1741 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1742 fact->ops->matsolve = NULL; 1743 fact->ops->matsolvetranspose = NULL; 1744 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1745 PetscFunctionReturn(PETSC_SUCCESS); 1746 } 1747 1748 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1749 { 1750 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1751 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1752 PetscInt m, nz; 1753 1754 PetscFunctionBegin; 1755 if (PetscDefined(USE_DEBUG)) { 1756 PetscInt i; 1757 PetscBool flg, missing; 1758 1759 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1760 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1761 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1762 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1763 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1764 } 1765 1766 /* Free the old stale stuff */ 1767 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1768 1769 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1770 but they will not be used. Allocate them just for easy debugging. 
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's own pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.
                                                                  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are views of the same in-place factored storage (csrRowPtr32/csrColIdx32/csrVal);
     the fill-mode/diag-type attributes tell cusparse which triangle (and diagonal) to use */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
*/ 1875 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1876 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1877 } 1878 1879 /* Estimate FLOPs of the numeric factorization */ 1880 { 1881 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1882 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1883 PetscLogDouble flops = 0.0; 1884 1885 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1886 Ai = Aseq->i; 1887 Adiag = Aseq->diag; 1888 for (PetscInt i = 0; i < m; i++) { 1889 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1890 nzRow = Ai[i + 1] - Ai[i]; 1891 nzLeft = Adiag[i] - Ai[i]; 1892 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1893 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1894 */ 1895 nzLeft = (nzRow - 1) / 2; 1896 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1897 } 1898 } 1899 fs->numericFactFlops = flops; 1900 } 1901 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1902 PetscFunctionReturn(PETSC_SUCCESS); 1903 } 1904 1905 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1906 { 1907 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1908 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1909 const PetscScalar *barray; 1910 PetscScalar *xarray; 1911 1912 PetscFunctionBegin; 1913 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1914 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1915 PetscCall(PetscLogGpuTimeBegin()); 1916 1917 /* Solve L*y = b */ 1918 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1919 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1920 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
                                       &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* In-place IC(0) numeric factorization on the GPU with cusparseXcsric02().
   Analogous to MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), but there is a single
   (lower) factor L; the transpose solve reuses L with CUSPARSE_OPERATION_TRANSPOSE. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Numeric analysis for the L solve; must follow cusparseXcsric02() since it reads the factored values */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* factorization is symmetric, so solve and solvetranspose use the same routine */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic IC(0) setup: clones A's structure into fact, creates the cusparse
   descriptors for M (full pattern) and L (lower triangle), allocates the
   factorization/solve buffers, and runs the structural analysis of csric02.
   The IS argument is unused: this fast path is only taken for an identity
   permutation (see MatICCFactorSymbolic_SeqAIJCUSPARSE). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): no fill beyond A's own pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
    */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* symmetric pattern: assume half the off-diagonal entries lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* LU numeric factorization: performed on the CPU by MatLUFactorNumeric_SeqAIJ();
   afterwards the GPU solve function pointers are installed unless CPU solves were
   requested (cusparsestruct->use_cpu_solve). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used.
    */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      /* identity orderings: use the NaturalOrdering solve variants */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic LU: run the CPU symbolic phase, then hook up the CUSPARSE numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU: dispatch to the device ILU(0) fast path when there are no levels of
   fill, both orderings are identities, and device factorization was requested;
   otherwise fall back to the CPU symbolic phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC: same dispatch logic as ILU above, with the IC(0) fast path */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic Cholesky: run the CPU symbolic phase, then hook up the CUSPARSE numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Query function composed on factor matrices: reports this solver package's name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE =
  "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
  `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE: creates the factor matrix B and
   installs the symbolic-factorization function pointers for the requested factor type.
   The -mat_factor_bind_factorization option ("device" by default, or "host") selects
   where the factorization is performed when possible. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* matrix is bound to the CPU: fall back to the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy matrix values from device back to host when the GPU copy is newer. Handles
   both unfactored matrices (values live in the cusparse CsrMatrix) and device-
   factorized matrices (values live in fs->csrVal); errors for other factored cases. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
PetscFunctionReturn(PETSC_SUCCESS); 2337 } 2338 2339 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2340 { 2341 PetscFunctionBegin; 2342 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2343 *array = ((Mat_SeqAIJ *)A->data)->a; 2344 PetscFunctionReturn(PETSC_SUCCESS); 2345 } 2346 2347 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2348 { 2349 PetscFunctionBegin; 2350 A->offloadmask = PETSC_OFFLOAD_CPU; 2351 *array = NULL; 2352 PetscFunctionReturn(PETSC_SUCCESS); 2353 } 2354 2355 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2356 { 2357 PetscFunctionBegin; 2358 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2359 *array = ((Mat_SeqAIJ *)A->data)->a; 2360 PetscFunctionReturn(PETSC_SUCCESS); 2361 } 2362 2363 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2364 { 2365 PetscFunctionBegin; 2366 *array = NULL; 2367 PetscFunctionReturn(PETSC_SUCCESS); 2368 } 2369 2370 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2371 { 2372 PetscFunctionBegin; 2373 *array = ((Mat_SeqAIJ *)A->data)->a; 2374 PetscFunctionReturn(PETSC_SUCCESS); 2375 } 2376 2377 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2378 { 2379 PetscFunctionBegin; 2380 A->offloadmask = PETSC_OFFLOAD_CPU; 2381 *array = NULL; 2382 PetscFunctionReturn(PETSC_SUCCESS); 2383 } 2384 2385 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2386 { 2387 Mat_SeqAIJCUSPARSE *cusp; 2388 CsrMatrix *matrix; 2389 2390 PetscFunctionBegin; 2391 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2392 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2393 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2394 
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Mirror the host CSR data of A on the GPU. Two paths:
  - same nonzero pattern as last copy (nonzerostate unchanged) and CSR format:
    refresh only the values array;
  - otherwise: destroy and rebuild the whole device structure (descriptors,
    alpha/beta device scalars, CSR or ELL/HYB storage, compressed-row index).
  The cached transpose is invalidated in both cases. If the host has no
  values yet (a->a == NULL) only the pattern is uploaded and the offload
  mask is NOT set to BOTH.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Pattern changed (or non-CSR format): rebuild the device structures from scratch */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Compressed-row storage keeps only rows with nonzeros; ridx maps back to full row numbers */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only, don't mark values as synced */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalars so SpMV/SpMM can run with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Stage a temporary CSR on the device, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Thrust functor: second tuple element += first (used for scatter-add of compressed rows) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
2577 { 2578 thrust::get<1>(t) = thrust::get<0>(t); 2579 } 2580 }; 2581 2582 struct VecCUDAEqualsReverse { 2583 template <typename Tuple> 2584 __host__ __device__ void operator()(Tuple t) 2585 { 2586 thrust::get<0>(t) = thrust::get<1>(t); 2587 } 2588 }; 2589 2590 struct MatMatCusparse { 2591 PetscBool cisdense; 2592 PetscScalar *Bt; 2593 Mat X; 2594 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2595 PetscLogDouble flops; 2596 CsrMatrix *Bcsr; 2597 2598 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2599 cusparseSpMatDescr_t matSpBDescr; 2600 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2601 cusparseDnMatDescr_t matBDescr; 2602 cusparseDnMatDescr_t matCDescr; 2603 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2604 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2605 void *dBuffer4; 2606 void *dBuffer5; 2607 #endif 2608 size_t mmBufferSize; 2609 void *mmBuffer; 2610 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2611 cusparseSpGEMMDescr_t spgemmDesc; 2612 #endif 2613 }; 2614 2615 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2616 { 2617 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2618 2619 PetscFunctionBegin; 2620 PetscCallCUDA(cudaFree(mmdata->Bt)); 2621 delete mmdata->Bcsr; 2622 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2623 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2624 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2625 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2626 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2627 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2628 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2629 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2630 #endif 2631 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/*
  Numeric phase of C = op(A) op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
  Handles AB, AtB, ABt, and the two-step products PtAP / RARt (sparse-dense SpMM
  into mmdata->X, then a dense-dense multiply with B^T or B). Uses cusparseSpMM
  (CUDA >= 11) or csrmm plus an explicit cublas transpose of B (older CUDA).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* Select op(A) and the result dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the cached explicit transpose instead of the (slower) transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the SpMM result into the intermediate X, not directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow-only buffer reuse across numeric calls */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* restore original (CPU) types if the caller supplied them */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Symbolic phase for sparse(AIJCUSPARSE) x dense products: sizes C, allocates
  the MatMatCusparse scratch data, and installs the numeric callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Numeric phase of sparse-sparse products C = op(A) op(B) with both operands
  SEQAIJCUSPARSE, via cusparseSpGEMM (CUDA >= 11; the reuse API on >= 11.4)
  or csrgemm on older toolkits. The symbolic phase has already built the
  device structures; AtB/ABt are realized with cached explicit transposes
  since spgemm itself does not support transposed operands.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: only the bookkeeping below is needed */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* For symmetric operands the transpose product degenerates to AB; the symbolic
     phase must have made the same choice or the stored structures won't match */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* reuse API: structure was computed in symbolic, only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase for C = op(A)*op(B) with both operands of type MATSEQAIJCUSPARSE.
   Computes the sparsity pattern of C on the GPU with cuSPARSE SpGEMM (the SpGEMMreuse API on
   CUDA >= 11.4, the SpGEMM API on CUDA 11.0-11.3, csrgemm before that), allocates C's CSR
   storage on the device, and copies the row offsets / column indices back to the host so that
   the CPU-side Mat_SeqAIJ bookkeeping (ilen, imax, rmax, diagonal markers) can be filled in.
   The numeric values of C are left to MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* Symmetric operands let us replace A^T*B (resp. A*B^t) by the cheaper A*B; record that the
     symbolic phase exploited symmetry so the numeric phase can do the same */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* pick operand structs (forming explicit transposes when needed) and decide whether the
     compressed-row (zero-row-dropping) storage of A/B carries over to B-as-used and C */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants so cuSPARSE can run with device pointer mode */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: set all row offsets to 0 and skip straight to the host-side finalization */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with uncompressed (full-length) row offsets; column indices
       and values are shared with the compressed storage, only the offsets array is new */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C starts with 0 nonzeros and NULL arrays; cuSPARSE fills in the size during the symbolic calls */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ so CPU-side queries keep working */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case
      MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A*xx; thin wrapper over the shared SpMV kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H*xx (conjugate transpose); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T*xx; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y[idx[i]] += x[i] for i in [0,n): scatters a compressed-row result back into the full-length
   vector. Launched 1-D with a bounds check, so any grid >= n threads is valid. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3523 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3524 { 3525 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3526 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3527 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3528 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3529 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3530 PetscBool compressed; 3531 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3532 PetscInt nx, ny; 3533 #endif 3534 3535 PetscFunctionBegin; 3536 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3537 if (!a->nz) { 3538 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3539 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3540 PetscFunctionReturn(PETSC_SUCCESS); 3541 } 3542 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3543 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3544 if (!trans) { 3545 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3546 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3547 } else { 3548 if (herm || !A->form_explicit_transpose) { 3549 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3550 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3551 } else { 3552 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3553 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3554 } 3555 } 3556 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3557 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3558 3559 try { 3560 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3561 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3562 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3563 3564 PetscCall(PetscLogGpuTimeBegin()); 3565 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3566 /* z = A x + beta y. 3567 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3568 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3569 */ 3570 xptr = xarray; 3571 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3572 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3573 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3574 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3575 allocated to accommodate different uses. So we get the length info directly from mat. 3576 */ 3577 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3578 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3579 nx = mat->num_cols; 3580 ny = mat->num_rows; 3581 } 3582 #endif 3583 } else { 3584 /* z = A^T x + beta y 3585 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3586 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3587 */ 3588 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3589 dptr = zarray; 3590 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3591 if (compressed) { /* Scatter x to work vector */ 3592 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3593 3594 thrust::for_each( 3595 #if PetscDefined(HAVE_THRUST_ASYNC) 3596 thrust::cuda::par.on(PetscDefaultCudaStream), 3597 #endif 3598 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3599 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3600 } 3601 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3602 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3603 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3604 nx = mat->num_rows; 3605 ny = mat->num_cols; 3606 } 3607 #endif 3608 } 3609 3610 /* csr_spmv does y = alpha op(A) x + beta y */ 3611 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3612 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3613 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3614 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3615 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3616 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3617 PetscCallCUSPARSE( 3618 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3619 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3620 3621 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3622 } else { 3623 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3624 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3625 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3626 } 3627 3628 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3629 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3630 #else 3631 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3632 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3633 #endif 3634 } else { 3635 if (cusparsestruct->nrows) { 3636 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3637 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3638 #else 3639 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3640 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3641 #endif 3642 } 3643 } 3644 PetscCall(PetscLogGpuTimeEnd()); 3645 3646 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3647 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3648 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3649 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3650 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3651 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3652 } 3653 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3654 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3655 } 3656 3657 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3658 if (compressed) { 3659 PetscCall(PetscLogGpuTimeBegin()); 3660 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3661 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3662 prevent that. So I just add a ScatterAdd kernel. 3663 */ 3664 #if 0 3665 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3666 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3667 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3668 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3669 VecCUDAPlusEquals()); 3670 #else 3671 PetscInt n = matstruct->cprowIndices->size(); 3672 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3673 #endif 3674 PetscCall(PetscLogGpuTimeEnd()); 3675 } 3676 } else { 3677 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3678 } 3679 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3680 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3681 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3682 } catch (char *ex) { 3683 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, 
"CUSPARSE error: %s", ex);
  }
  /* Flop accounting: 2 flops per stored nonzero for the multiply-add; when no addend
     vector is given, nonzerorowcnt is subtracted (presumably one saved add per nonzero
     row since beta = 0 -- NOTE(review): confirm against the kernel's beta selection) */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T*xx + yy; thin wrapper that dispatches to the shared mult-add kernel with trans = PETSC_TRUE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscObjectState    onnz = A->nonzerostate; /* state before CPU-side assembly runs */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* if assembly changed the nonzero pattern, the cached device-side matrix is stale: free it */
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
  assembly performance the user should preallocate the matrix storage by setting
  the parameter `nz` (or the array `nnz`).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m - number of rows
. n - number of columns
. nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. 
A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release either the mult structures (unfactored case) or the triangular-factor
   structures, then detach all composed methods before the base SeqAIJ destroy runs */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate via the host path, then convert the copy in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = Y + a*X on the device. Three paths: identical sparsity -> single cublas axpy on the
   value arrays; subset sparsity -> cusparse spgeam; otherwise fall back to the CPU kernel */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  /* mismatched axpy implementations means one operand is bound to CPU: use the host path */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* &a and &b are host pointers, so temporarily switch the handle's pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected by the rest of this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays line up entry-for-entry, so one axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: host fallback; the cached transpose becomes stale */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y, implemented as a cublas scal over the nz-long device value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero the values on both device (CSR and cached transpose, if present) and host copies */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device copy was zeroed too */
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Switch the operation tables between host (SeqAIJ) and device (CUSPARSE) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* factored matrices keep their current ops; only record the binding */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* binding to CPU: make sure host data is current, then install host kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-level ops so the base defaults take over */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* binding to GPU: install the CUSPARSE kernels and compose the device-side helpers */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimization only applies to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocate the device-side bookkeeping
   struct, set CUDA as the default vector type, and install the type's ops table */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: build a SeqAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                      Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

   Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

/* Register the CUSPARSE (LU/Cholesky/ILU/ICC) and CUSPARSEBAND (LU) solver types */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all COO-preallocation state so the matrix can be COO-preallocated again.
   Every freed pointer is reset to NULL: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
   under an "if non-NULL" guard, so leaving them dangling here would cause a double
   cudaFree when the matrix is later destroyed without a new preallocation. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* prevent a second cudaFree in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free the whole device-side bookkeeping struct of an unfactored matrix */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free a device CSR container (thrust vectors for values/indices/offsets) and NULL the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
/* Pre-CUDA-11.3 overload: tear down one triangular-factor structure (descriptor,
   csrsv info, CSR storage, solve/csr2csc scratch buffers, pinned host copy) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Tear down a SpMV/SpMM structure: the stored matrix (HYB or CSR depending on format),
   descriptors, device-resident scalar constants, and the per-operation SpMV descriptors */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-side scalars, hence cudaFree */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per cusparseOperation_t value (N/T/H) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all factorization state (factors, permutations, SpSV descriptors and buffers)
   while keeping the struct and its cusparse handle alive for refactorization */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 3, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 3, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Full teardown: reset all factor state, then destroy the handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, column) ordering for COO (i,j) tuples */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of COO (i,j) tuples, used to detect repeated entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when two adjacent keys are equal, 1 otherwise */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ?
0 : 1; }
};

/* Logical OR of two change flags; combined with IJDiff to detect a new (i,j) pair */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   Scatters/accumulates the COO values v[] (in the user's original order) into the CSR
   value array on the GPU, using the permutation cooPerm[] (and, when the COO input had
   repeated (i,j) entries, the segment map cooPerm_a[]) built at preallocation time. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a    = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device copy of v[] when v[] is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v[] with INSERT_VALUES zeroes the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /*
        thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Drop (or destroy) the cached explicit transpose so it is rebuilt on next use.
   With destroy == PETSC_TRUE the transpose multstruct and the csr2csc index map
   are freed; otherwise they are kept and only marked out-of-date. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* reuse the permutation arrays only if the COO size is unchanged */
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* bring coo_i[] to the device if the caller passed host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());                       /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>());            /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR structure with one matching the compacted COO pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Dispatch COO preallocation: the fast 'Basic' path when indices are on device or
   contain no negative (ignored) entries; otherwise the host fallback that builds the
   extended jmap/perm maps and mirrors them on the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices can only be detected on host memory; device input is assumed basic */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-built COO maps on the device for MatSetValuesCOO */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
PetscFunctionReturn(PETSC_SUCCESS); 4562 } 4563 4564 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4565 { 4566 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4567 const PetscCount grid_size = gridDim.x * blockDim.x; 4568 for (; i < nnz; i += grid_size) { 4569 PetscScalar sum = 0.0; 4570 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4571 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4572 } 4573 } 4574 4575 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4576 { 4577 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4578 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4579 PetscCount Annz = seq->nz; 4580 PetscMemType memtype; 4581 const PetscScalar *v1 = v; 4582 PetscScalar *Aa; 4583 4584 PetscFunctionBegin; 4585 if (dev->use_extended_coo) { 4586 PetscCall(PetscGetMemType(v, &memtype)); 4587 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4588 PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 4589 PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4590 } 4591 4592 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4593 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4594 4595 if (Annz) { 4596 MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 4597 PetscCallCUDA(cudaPeekAtLastError()); 4598 } 4599 4600 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4601 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4602 4603 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4604 } else { 4605 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4606 } 4607 PetscFunctionReturn(PETSC_SUCCESS); 4608 } 4609 4610 /*@C 
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* NOTE(review): both i and j must be requested; a single NULL returns early with neither set */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i - the CSR row pointers
- j - the CSR column indices

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* no device work needed; just invalidate the caller's borrowed pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only: offloadmask and object state are deliberately left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller may modify device values: device copy becomes authoritative, cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* unlike GetArray(), no MatSeqAIJCUSPARSECopyToGPU(): caller promises to overwrite all values */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Row-major order on (row, col, value, origin-flag) 4-tuples; only row/col participate.
   Used to merge the COO streams of two matrices while recording which matrix each
   entry came from (the 4th tuple slot). */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an index; used to shift B's column indices by A->cmap->n */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4884 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4885 { 4886 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4887 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4888 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4889 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4890 PetscInt Annz, Bnnz; 4891 cusparseStatus_t stat; 4892 PetscInt i, m, n, zero = 0; 4893 4894 PetscFunctionBegin; 4895 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4896 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4897 PetscValidPointer(C, 4); 4898 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4899 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4900 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4901 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4902 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4903 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4904 if (reuse == MAT_INITIAL_MATRIX) { 4905 m = A->rmap->n; 4906 n = A->cmap->n + B->cmap->n; 4907 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4908 PetscCall(MatSetSizes(*C, m, n, m, n)); 4909 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4910 c = (Mat_SeqAIJ *)(*C)->data; 4911 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4912 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4913 Ccsr = new CsrMatrix; 4914 Cmat->cprowIndices = NULL; 4915 c->compressedrow.use = PETSC_FALSE; 4916 c->compressedrow.nrows = 0; 4917 c->compressedrow.i = NULL; 4918 c->compressedrow.rindex = NULL; 4919 Ccusp->workVector = NULL; 4920 Ccusp->nrows = m; 4921 Ccusp->mat = Cmat; 4922 Ccusp->mat->mat = Ccsr; 4923 Ccsr->num_rows = 
m; 4924 Ccsr->num_cols = n; 4925 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4926 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4927 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4928 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4929 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4930 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4931 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4932 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4933 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4934 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4935 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4936 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4937 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4938 4939 Acsr = (CsrMatrix *)Acusp->mat->mat; 4940 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4941 Annz = (PetscInt)Acsr->column_indices->size(); 4942 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4943 c->nz = Annz + Bnnz; 4944 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4945 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4946 Ccsr->values = new THRUSTARRAY(c->nz); 4947 Ccsr->num_entries = c->nz; 4948 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4949 if (c->nz) { 4950 auto Acoo = new THRUSTINTARRAY32(Annz); 4951 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4952 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4953 THRUSTINTARRAY32 *Aroff, *Broff; 4954 4955 if (a->compressedrow.use) { /* need full row offset */ 4956 if (!Acusp->rowoffsets_gpu) { 4957 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4958 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4959 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4960 } 4961 Aroff = Acusp->rowoffsets_gpu; 4962 } else Aroff = Acsr->row_offsets; 4963 if (b->compressedrow.use) { /* need full row offset */ 4964 if (!Bcusp->rowoffsets_gpu) { 4965 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4966 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4967 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4968 } 4969 Broff = Bcusp->rowoffsets_gpu; 4970 } else Broff = Bcsr->row_offsets; 4971 PetscCall(PetscLogGpuTimeBegin()); 4972 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4973 PetscCallCUSPARSE(stat); 4974 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4975 PetscCallCUSPARSE(stat); 4976 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4977 auto Aperm = thrust::make_constant_iterator(1); 4978 auto Bperm = thrust::make_constant_iterator(0); 4979 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4980 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4981 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4982 #else 4983 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4984 auto Bcib = Bcsr->column_indices->begin(); 4985 auto Bcie = Bcsr->column_indices->end(); 4986 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4987 #endif 4988 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4989 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4990 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4991 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4992 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4993 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4994 auto p1 = Ccusp->cooPerm->begin(); 4995 auto p2 = Ccusp->cooPerm->begin(); 4996 thrust::advance(p2, Annz); 4997 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4998 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4999 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 5000 #endif 5001 auto cci = thrust::make_counting_iterator(zero); 5002 auto cce = thrust::make_counting_iterator(c->nz); 5003 #if 0 //Errors on SUMMIT cuda 11.1.0 5004 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 5005 #else 5006 auto pred = thrust::identity<int>(); 5007 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 5008 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 5009 #endif 5010 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 5011 PetscCallCUSPARSE(stat); 5012 PetscCall(PetscLogGpuTimeEnd()); 5013 delete wPerm; 5014 delete Acoo; 5015 delete Bcoo; 5016 delete Ccoo; 5017 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 5018 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 5019 PetscCallCUSPARSE(stat); 5020 #endif 5021 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 5022 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 5023 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5024 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5025 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5026 CsrMatrix *CcsrT = new CsrMatrix; 5027 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 5028 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 5029 5030 (*C)->form_explicit_transpose = PETSC_TRUE; 5031 (*C)->transupdated = PETSC_TRUE; 5032 Ccusp->rowoffsets_gpu = NULL; 5033 CmatT->cprowIndices = NULL; 5034 CmatT->mat = CcsrT; 5035 CcsrT->num_rows = n; 5036 CcsrT->num_cols = m; 5037 CcsrT->num_entries = c->nz; 5038 5039 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 5040 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5041 CcsrT->values = new THRUSTARRAY(c->nz); 5042 5043 PetscCall(PetscLogGpuTimeBegin()); 5044 auto rT = CcsrT->row_offsets->begin(); 5045 if (AT) { 5046 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 5047 thrust::advance(rT, -1); 5048 } 5049 if (BT) { 5050 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 5051 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 5052 thrust::copy(titb, tite, rT); 5053 } 5054 auto cT = CcsrT->column_indices->begin(); 5055 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 5056 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 5057 auto vT = CcsrT->values->begin(); 5058 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 5059 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 5060 PetscCall(PetscLogGpuTimeEnd()); 5061 5062 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 5063 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 5064 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5065 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 5066 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 5067 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 5068 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5069 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5070 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5071 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 5072 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 5073 PetscCallCUSPARSE(stat); 5074 #endif 5075 Ccusp->matTranspose = CmatT; 5076 } 5077 } 5078 5079 c->singlemalloc = PETSC_FALSE; 5080 c->free_a = PETSC_TRUE; 5081 c->free_ij = PETSC_TRUE; 5082 PetscCall(PetscMalloc1(m + 1, &c->i)); 5083 PetscCall(PetscMalloc1(c->nz, &c->j)); 5084 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5085 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5086 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5087 ii = *Ccsr->row_offsets; 5088 jj = *Ccsr->column_indices; 5089 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5090 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5091 } else { 5092 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5093 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5094 } 5095 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 5096 PetscCall(PetscMalloc1(m, &c->ilen)); 5097 PetscCall(PetscMalloc1(m, &c->imax)); 5098 c->maxnz = c->nz; 5099 c->nonzerorowcnt = 0; 5100 c->rmax = 0; 5101 for (i = 0; i < m; i++) { 5102 const PetscInt nn = c->i[i + 1] - c->i[i]; 5103 c->ilen[i] = c->imax[i] = nn; 5104 c->nonzerorowcnt += (PetscInt) !!nn; 5105 c->rmax = PetscMax(c->rmax, nn); 5106 } 5107 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 5108 PetscCall(PetscMalloc1(c->nz, &c->a)); 5109 (*C)->nonzerostate++; 5110 PetscCall(PetscLayoutSetUp((*C)->rmap)); 5111 PetscCall(PetscLayoutSetUp((*C)->cmap)); 5112 Ccusp->nonzerostate = (*C)->nonzerostate; 5113 (*C)->preallocated = PETSC_TRUE; 5114 } else { 5115 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 5116 c = (Mat_SeqAIJ *)(*C)->data; 5117 if (c->nz) { 5118 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 5119 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 5120 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 5121 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 5122 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5123 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5124 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 5125 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 5126 Acsr = (CsrMatrix *)Acusp->mat->mat; 5127 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 5128 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 5129 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 5130 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 5131 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 5132 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 5133 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 5134 auto pmid = Ccusp->cooPerm->begin(); 5135 thrust::advance(pmid, Acsr->num_entries); 5136 PetscCall(PetscLogGpuTimeBegin()); 5137 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 5138 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 5139 thrust::for_each(zibait, zieait, VecCUDAEquals()); 5140 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 5141 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 5142 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 5143 
/* (tail of the preceding merge routine, reproduced unchanged: invalidate any stale C^T, then refresh it from A^T and B^T when both are present) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* the values of C^T are those of A^T followed by those of B^T */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gather selected entries of the matrix value array into v[]

   Input Parameters:
+  A   - the SEQAIJCUSPARSE matrix; its values are accessed read-only on the GPU
.  n   - number of entries to copy
-  idx - (optional) 0-based positions into the aij value array to gather; if NULL, the first n entries are copied contiguously

   Output Parameter:
.  v - destination array; may point to either host or device memory (detected at runtime with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does v live in device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather av[idx[i]] -> dv[i] on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); /* host idx -> device widx */

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device scratch; copied back to the host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w; /* deleting NULL is a no-op */
  } else {
    /* no index set: copy the leading n values directly */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* bug fix: when v is host memory the n scalars travel GPU -> CPU (cudaMemcpyDeviceToHost above),
     so the transfer must be logged with PetscLogGpuToCpu (it previously logged PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}