/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
#define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#include <thrust/async/for_each.h>
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
/* Names used by PetscOptionsEnum() for -mat_cusparse_storage_format and friends; the trailing
   entries are the enum type name and option prefix required by the PetscOptionsEnum() convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

   typedef enum {
     CUSPARSE_MV_ALG_DEFAULT = 0,
     CUSPARSE_COOMV_ALG      = 1,
     CUSPARSE_CSRMV_ALG1     = 2,
     CUSPARSE_CSRMV_ALG2     = 3
   } cusparseSpMVAlg_t;

   typedef enum {
     CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
     CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
     CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
     CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
     CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
     CUSPARSE_SPMM_ALG_DEFAULT = 0,
     CUSPARSE_SPMM_COO_ALG1    = 1,
     CUSPARSE_SPMM_COO_ALG2    = 2,
     CUSPARSE_SPMM_COO_ALG3    = 3,
     CUSPARSE_SPMM_COO_ALG4    = 5,
     CUSPARSE_SPMM_CSR_ALG1    = 4,
     CUSPARSE_SPMM_CSR_ALG2    = 6,
   } cusparseSpMMAlg_t;

   typedef enum {
     CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
     CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
   } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /* cusparse does not have enum 0! We created one */, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

/* Forward declarations of the static implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Implementation backing MatCUSPARSESetFormat(); records the requested GPU storage format in the
   matrix's cuSPARSE context. NOTE(review): both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently
   set the same field, so for seq matrices the two ops are indistinguishable. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
       `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if the matrix provides one; no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Implementation backing MatCUSPARSESetUseCPUSolve(); stores the CPU-solve flag in the matrix's cuSPARSE context */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatSetOption() implementation; intercepts MAT_FORM_EXPLICIT_TRANSPOSE and delegates everything else to the SeqAIJ base */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Processes the -mat_cusparse_* command line options (storage format, CPU solve, SpMV/SpMM/csr2csc algorithm choices) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Copies the host LU factors (stored by the CPU factorization in PETSc's skewed L/U layout) into a single
   regular CSR matrix M on the device and (re)runs cusparseSpSV_analysis() so triangular solves can be done on GPU.
   The symbolic part (row pointers, column indices, descriptors, buffers) is built only once; later calls
   only refresh the numeric values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                          // entries of L
        Mj[Mi[i] + llen] = i;                                                            // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same CSR arrays; the fill-mode/diag-type attributes select the triangle each descriptor sees
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                          // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Pre-CUDA-11.4 path: extracts the unit-diagonal lower triangular factor L from the host ILU factors into a
   separate device CSR matrix and runs the legacy csrsv solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
      */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign()s below copy asynchronously-friendly memory */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0; /* unit diagonal entry appended after the strictly-lower part of the row */
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h (pinned) is kept for value-only updates on later factorizations; the index arrays are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Pre-CUDA-11.4 path: extracts the upper triangular factor U (diagonal stored inverted) from the host ILU
   factors into a separate device CSR matrix and runs the legacy csrsv solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
      */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix; the host layout stores U rows back-to-front, so walk rows from
           the last to the first, filling from the end of the arrays */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* host factor stores the inverted diagonal; recover the true value */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Pushes the host (I)LU factors of A to the GPU (version-dependent layout) and caches the row/column
   permutation indices on the device for use by the GPU triangular solves */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Cholesky analogue of MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(): copies the host ICC/Cholesky factor (U, unit
   diagonal stored separately/inverted) to the device and prepares SpSV for U and U^T solves.
   NOTE(review): "Cheolesky" is a typo for "Cholesky" in the function name; not renamed here since its caller
   is outside this view. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (done on every call, not just the first: the numeric factor changes)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b, optionally applying the cached row/column permutations.
// b is reordered into fs->X (if needed), then Ut Y = X, Y *= D, U X = Y, and
// finally X is reordered back into x (if needed).
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Pre-11.4 path: build separate upper/lower triangular factor structures for ICC on the GPU.
   Both factors share the structure of U (stored in sbaij layout on host); the lower solve is
   done with the transpose of U, hence both descriptors use CSR data built from U.
   On repeat calls (factors already allocated) only the numerical values are refilled. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower factor is U solved with the transpose operation, hence UPPER fill mode here */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix (values only; structure already on the GPU) */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Make the ICC/Cholesky factors of A available on the GPU and cache the (inverse)
   permutation indices as device arrays. Mirrors MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU,
   but a single permutation IS (a->row) provides both the row and column orderings. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1004 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1005 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1006 PetscCall(ISRestoreIndices(iip, &irip)); 1007 PetscCall(ISDestroy(&iip)); 1008 PetscCall(ISRestoreIndices(ip, &rip)); 1009 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1010 } 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1015 { 1016 PetscFunctionBegin; 1017 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1018 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1019 B->offloadmask = PETSC_OFFLOAD_CPU; 1020 1021 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1022 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1023 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1024 #else 1025 /* determine which version of MatSolve needs to be used. */ 1026 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1027 IS ip = b->row; 1028 PetscBool perm_identity; 1029 1030 PetscCall(ISIdentity(ip, &perm_identity)); 1031 if (perm_identity) { 1032 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1033 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1034 } else { 1035 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1036 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1037 } 1038 #endif 1039 B->ops->matsolve = NULL; 1040 B->ops->matsolvetranspose = NULL; 1041 1042 /* get the triangular factors */ 1043 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1044 PetscFunctionReturn(PETSC_SUCCESS); 1045 } 1046 1047 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1048 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1049 { 1050 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1051 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1052 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1053 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1054 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1055 cusparseIndexBase_t indexBase; 1056 cusparseMatrixType_t matrixType; 1057 cusparseFillMode_t fillMode; 1058 cusparseDiagType_t diagType; 1059 1060 PetscFunctionBegin; 1061 /* allocate space for the transpose of the lower triangular factor */ 1062 PetscCall(PetscNew(&loTriFactorT)); 1063 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1064 1065 /* set the matrix descriptors of the lower triangular factor */ 1066 matrixType = cusparseGetMatType(loTriFactor->descr); 1067 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1068 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1069 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1070 1071 /* Create the matrix description */ 1072 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1073 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1074 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1075 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1076 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1077 1078 /* set the operation */ 1079 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1080 1081 /* allocate GPU space for the CSC of the lower triangular factor*/ 1082 loTriFactorT->csrMat = new CsrMatrix; 1083 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1084 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1085 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1086 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1087 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1088 
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1089 1090 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1091 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1092 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1093 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1094 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1095 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1096 #endif 1097 1098 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1099 { 1100 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1101 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1102 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1103 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1104 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1105 #else 1106 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1107 #endif 1108 PetscCallCUSPARSE(stat); 1109 } 1110 1111 PetscCallCUDA(WaitForCUDA()); 1112 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1113 1114 /* Create the solve analysis information */ 1115 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1116 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1117 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1118 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1119 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1120 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1121 #endif 1122 1123 /* perform the solve analysis */ 1124 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1125 
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1126 1127 PetscCallCUDA(WaitForCUDA()); 1128 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1129 1130 /* assign the pointer */ 1131 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1132 1133 /*********************************************/ 1134 /* Now the Transpose of the Upper Tri Factor */ 1135 /*********************************************/ 1136 1137 /* allocate space for the transpose of the upper triangular factor */ 1138 PetscCall(PetscNew(&upTriFactorT)); 1139 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1140 1141 /* set the matrix descriptors of the upper triangular factor */ 1142 matrixType = cusparseGetMatType(upTriFactor->descr); 1143 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1144 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1145 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1146 1147 /* Create the matrix description */ 1148 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1149 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1150 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1151 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1152 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1153 1154 /* set the operation */ 1155 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1156 1157 /* allocate GPU space for the CSC of the upper triangular factor*/ 1158 upTriFactorT->csrMat = new CsrMatrix; 1159 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1160 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1161 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1162 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1163 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1164 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1165 1166 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1167 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1168 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1169 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1170 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1171 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1172 #endif 1173 1174 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1175 { 1176 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1177 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1178 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1180 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1181 #else 1182 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1183 #endif 1184 PetscCallCUSPARSE(stat); 1185 } 1186 1187 PetscCallCUDA(WaitForCUDA()); 1188 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1189 1190 /* Create the solve analysis information */ 1191 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1192 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1193 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1194 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1195 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1196 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1197 #endif 1198 1199 /* perform the solve analysis */ 1200 /* christ, would it have killed you to put this stuff in a function????????? */ 1201 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1202 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1203 1204 PetscCallCUDA(WaitForCUDA()); 1205 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1206 1207 /* assign the pointer */ 1208 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 #endif 1212 1213 struct PetscScalarToPetscInt { 1214 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1215 }; 1216 1217 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1218 { 1219 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1220 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1221 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1222 cusparseStatus_t stat; 1223 cusparseIndexBase_t indexBase; 1224 1225 PetscFunctionBegin; 1226 
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is already current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats rebuild the transpose from scratch every time */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-side scalars used by SpMV with the transpose) */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n; /* dimensions swapped: this is A^T */
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* mirror A's (uncompressed) row offsets on the GPU; needed as csr2csc input */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT holds the transpose, yet num_rows/num_cols are not swapped relative to A
         here — looks suspicious for rectangular matrices; confirm against upstream before changing */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time: compute the CSR->CSC value permutation by feeding the sequence 0,1,2,... through
         csr2csc; the permuted output, cast back to integers, is the permutation csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* fast path: gather A's values through the cached permutation into the transpose's value array */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b with A = L U (SpSV path, CUDA >= 11.4), applying row/column
   permutations when the factorization was done with an ordering */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note: cusparseSpSV_solve() reuses the external buffer that was supplied to cusparseSpSV_analysis()
  // (continued) cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    // need the intermediate buffer X so the final column permutation can read from it
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); /* forward+backward substitution: ~2*nz - m flops */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A^T x = b using the same L/U factors with transposed SpSV operations
   (A^T = U^T L^T, so solve U^T y = b then L^T x = y). The transpose SpSV
   descriptors and analysis are created lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* analysis is numeric, so it must be redone whenever the factor values change */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/* Legacy (CUDA < 11.4) transpose solve using pre-analyzed transposed triangular factors
   and row/column permutations; assumes rpermIndices/cpermIndices are set — TODO confirm
   callers guarantee this for the permuted path */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U (A^T = U^T L^T, so the transposed upper factor comes first) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Same as above for the natural-ordering case: no permutations are applied */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U (A^T = U^T L^T: transposed upper factor first) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Legacy (CUDA < 11.4) forward solve A x = b via L then U, with row/column permutations;
   assumes rpermIndices/cpermIndices are set — TODO confirm callers guarantee this */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Natural-ordering variant: no permutations, b goes straight through L then U */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization: copies A's values into fact's preallocated device CSR
   (pattern set up by the symbolic phase) and factors in place with csrilu02, then runs
   the numeric SpSV analyses for the subsequent triangular solves */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve  = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1743 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1744 fact->ops->matsolve = NULL; 1745 fact->ops->matsolvetranspose = NULL; 1746 PetscCall(PetscLogGpuTimeEnd()); 1747 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1748 PetscFunctionReturn(PETSC_SUCCESS); 1749 } 1750 1751 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1752 { 1753 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1754 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1755 PetscInt m, nz; 1756 1757 PetscFunctionBegin; 1758 if (PetscDefined(USE_DEBUG)) { 1759 PetscInt i; 1760 PetscBool flg, missing; 1761 1762 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1763 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1764 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1765 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1766 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1767 } 1768 1769 /* Free the old stale stuff */ 1770 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1771 1772 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1773 but they will not be used. Allocate them just for easy debugging. 
1774 */ 1775 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1776 1777 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1778 fact->factortype = MAT_FACTOR_ILU; 1779 fact->info.factor_mallocs = 0; 1780 fact->info.fill_ratio_given = info->fill; 1781 fact->info.fill_ratio_needed = 1.0; 1782 1783 aij->row = NULL; 1784 aij->col = NULL; 1785 1786 /* ====================================================================== */ 1787 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1788 /* We'll do in-place factorization on fact */ 1789 /* ====================================================================== */ 1790 const int *Ai, *Aj; 1791 1792 m = fact->rmap->n; 1793 nz = aij->nz; 1794 1795 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1797 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1798 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1799 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801 1802 /* ====================================================================== */ 1803 /* Create descriptors for M, L, U */ 1804 /* ====================================================================== */ 1805 cusparseFillMode_t fillMode; 1806 cusparseDiagType_t diagType; 1807 1808 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1809 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1810 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1811 1812 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1813 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1814 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1815 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1816 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1817 */ 1818 fillMode = CUSPARSE_FILL_MODE_LOWER; 1819 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1820 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1821 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1822 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1823 1824 fillMode = CUSPARSE_FILL_MODE_UPPER; 1825 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1826 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1829 1830 /* ========================================================================= */ 1831 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1832 /* ========================================================================= */ 1833 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1834 if (m) 1835 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1836 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1837 1838 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1839 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1840 1841 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1842 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1843 1844 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1845 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1846 1847 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1848 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1849 1850 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1851 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1852 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1853 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1854 */ 1855 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1856 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1857 fs->spsvBuffer_L = fs->factBuffer_M; 1858 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1859 } else { 1860 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1861 fs->spsvBuffer_U = fs->factBuffer_M; 1862 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1863 } 1864 1865 /* ========================================================================== */ 1866 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1867 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1868 /* ========================================================================== */ 1869 int structural_zero; 1870 cusparseStatus_t status; 1871 1872 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1873 if (m) 1874 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1875 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1876 if (PetscDefined(USE_DEBUG)) { 1877 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1878 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1879 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1880 } 1881 1882 /* Estimate FLOPs of the numeric factorization */ 1883 { 1884 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1885 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1886 PetscLogDouble flops = 0.0; 1887 1888 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1889 Ai = Aseq->i; 1890 Adiag = Aseq->diag; 1891 for (PetscInt i = 0; i < m; i++) { 1892 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1893 nzRow = Ai[i + 1] - Ai[i]; 1894 nzLeft = Adiag[i] - Ai[i]; 1895 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1896 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1897 */ 1898 nzLeft = (nzRow - 1) / 2; 1899 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1900 } 1901 } 1902 fs->numericFactFlops = flops; 1903 } 1904 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1905 PetscFunctionReturn(PETSC_SUCCESS); 1906 } 1907 1908 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1909 { 1910 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1911 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1912 const PetscScalar *barray; 1913 PetscScalar *xarray; 1914 1915 PetscFunctionBegin; 1916 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1917 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1918 PetscCall(PetscLogGpuTimeBegin()); 1919 1920 /* Solve L*y = b */ 1921 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1922 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1923 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1924 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1925 1926 /* Solve Lt*x = y */ 1927 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1928 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1929 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1930 1931 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1932 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1933 1934 PetscCall(PetscLogGpuTimeEnd()); 1935 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1936 PetscFunctionReturn(PETSC_SUCCESS); 1937 } 1938 1939 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1940 { 1941 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1942 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1943 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1944 CsrMatrix *Acsr; 1945 PetscInt m, nz; 1946 PetscBool flg; 1947 1948 PetscFunctionBegin; 1949 if (PetscDefined(USE_DEBUG)) { 1950 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1951 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1952 } 1953 1954 /* Copy A's value to fact */ 1955 m = fact->rmap->n; 1956 nz = aij->nz; 1957 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1958 Acsr = (CsrMatrix *)Acusp->mat->mat; 1959 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1960 1961 /* Factorize fact inplace */ 1962 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1963 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1964 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1965 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1966 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1967 */ 1968 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1969 if (PetscDefined(USE_DEBUG)) { 1970 int numerical_zero; 1971 cusparseStatus_t status; 1972 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1973 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1974 } 1975 1976 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1977 1978 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1979 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1980 */ 1981 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1982 1983 fact->offloadmask = PETSC_OFFLOAD_GPU; 1984 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1985 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1986 fact->ops->matsolve = NULL; 1987 fact->ops->matsolvetranspose = NULL; 1988 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1989 PetscFunctionReturn(PETSC_SUCCESS); 1990 } 

/* Symbolic ICC(0) setup: allocates device CSR arrays on fact, creates cuSPARSE descriptors for M and L,
   sizes and allocates the factorization/solve buffers, and runs the structural analysis phase of csric02.
   The permutation IS is ignored (ICC(0) here is natural-ordering only; see the caller's guard).
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) has the same pattern as A: no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
     */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, logged later by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           (Only the lower triangular pattern is stored here, so roughly half of each row lies left of the diagonal.)
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Numeric LU factorization: the factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ);
   afterwards the triangular solves are dispatched to the GPU unless use_cpu_solve is set.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* CPU factorization needs A's latest host values */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic LU: delegate to the SeqAIJ host implementation, then hook in the CUSPARSE numeric stage */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU: use the fully on-device ILU(0) path when levels==0 with natural ordering and
   device factorization was requested; otherwise fall back to the host SeqAIJ symbolic phase.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC: analogous dispatch — on-device ICC(0) when levels==0 with identity permutation,
   host SeqAIJ fallback otherwise.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic Cholesky: delegate to the SeqAIJ host implementation, then hook in the CUSPARSE numeric stage */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reports the solver package name for this factor type (queried via MatFactorGetSolverType()) */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = 
"cusparse" - A matrix type providing triangular solvers for seq matrices 2248 on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2249 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2250 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2251 CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2252 algorithms are not recommended. This class does NOT support direct solver operations. 2253 2254 Level: beginner 2255 2256 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 2257 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2258 M*/ 2259 2260 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2261 { 2262 PetscInt n = A->rmap->n; 2263 PetscBool factOnDevice, factOnHost; 2264 char *prefix; 2265 char factPlace[32] = "device"; /* the default */ 2266 2267 PetscFunctionBegin; 2268 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2269 PetscCall(MatSetSizes(*B, n, n, n, n)); 2270 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2271 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2272 2273 prefix = (*B)->factorprefix ? 
(*B)->factorprefix : ((PetscObject)A)->prefix; 2274 PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat"); 2275 PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2276 PetscOptionsEnd(); 2277 PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2278 PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2279 PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2280 ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2281 2282 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2283 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2284 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2285 if (!A->boundtocpu) { 2286 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2287 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2288 } else { 2289 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2290 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2291 } 2292 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2293 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2294 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2295 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2296 if (!A->boundtocpu) { 2297 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2298 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2299 } else { 2300 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2301 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings for the symmetric factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy the matrix values from the device back to the host CSR arrays when the
   up-to-date copy lives only on the GPU. For an unfactored matrix the values come
   from the cusp->mat CSR storage; for a factored matrix (CUDA >= 11.4 only) they
   come from fs->csrVal. On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* same spptr reinterpreted: for factored matrices it holds the triangular-factor struct */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      /* only the numerical values are copied; the sparsity pattern on the host is already valid */
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get read/write access to the host values array, syncing from the GPU first if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Values may have been modified on the host, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access: sync from the GPU but do not invalidate the device copy */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host copy, current host contents will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Expose the device CSR arrays (row offsets, column indices, values) of an
   unfactored matrix together with their memory type (always CUDA here) */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* device index arrays are 32-bit (THRUSTINTARRAY32), so only usable when PetscInt is 32-bit */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mirror the host AIJ matrix onto the GPU. Two paths:
   - same nonzero pattern as the cached device copy (CSR format only): refresh the
     values array in place and invalidate only the transpose values;
   - otherwise: destroy and rebuild the whole device structure (descriptor, index
     arrays, scalar constants), honoring compressed-row storage when in use. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when host has no values yet, so offload mask stays unset */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern unchanged: keep the transpose structure, invalidate only its values */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: allocate device storage for the pattern only */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR copy first, convert it to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct
VecCUDAEquals {
  /* thrust functor: copy the first tuple element into the second (y = x) */
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* thrust functor: copy the second tuple element into the first (x = y) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for sparse-dense and
   sparse-sparse products computed with cuSPARSE */
struct MatMatCusparse {
  PetscBool    cisdense; /* C was MATSEQDENSE on entry; convert back after the GPU product */
  PetscScalar *Bt;       /* buffer for B^T (pre-CUDA-11 path only, csrmm cannot transpose B) */
  Mat          X;        /* intermediate dense result for PtAP/RARt */
  PetscBool    reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destroy callback for MatMatCusparse: release device buffers, cuSPARSE
   descriptors, the intermediate matrix X, and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* Numeric phase of sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt) on
   the GPU. For PtAP/RARt the sparse product lands in the intermediate X and a
   dense product with B finishes the job. Uses cusparseSpMM (CUDA >= 11) or the
   legacy csrmm path (with an explicit B^T via cuBLAS geam) otherwise. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which copy of A (plain or transpose) and which cusparse op to use,
     plus the dimensions of the result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* sparse result goes into the intermediate X, not C */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo any temporary type conversions performed above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse(A) x dense(B) products: set sizes/type of C and
   allocate the MatMatCusparse scratch data used by the numeric phase */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of sparse(A) x sparse(B) products (AB, AtB, ABt) using cuSPARSE
   SpGEMM. The sparsity pattern of C was fixed by the symbolic phase; this only
   recomputes the values (SpGEMMreuse on CUDA >= 11.4). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let AtB/ABt be computed as AB; symbolic must have made the same choice */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* pattern of C fixed by symbolic: only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of sparse(A) x sparse(B) products: determines the nonzero
   pattern of C on the GPU and allocates the SpGEMM scratch data (continues
   beyond this chunk) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3035 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3036 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3037 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3038 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3039 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3040 3041 ptype = product->type; 3042 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3043 ptype = MATPRODUCT_AB; 3044 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3045 } 3046 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3047 ptype = MATPRODUCT_AB; 3048 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3049 } 3050 biscompressed = PETSC_FALSE; 3051 ciscompressed = PETSC_FALSE; 3052 switch (ptype) { 3053 case MATPRODUCT_AB: 3054 m = A->rmap->n; 3055 n = B->cmap->n; 3056 k = A->cmap->n; 3057 Amat = Acusp->mat; 3058 Bmat = Bcusp->mat; 3059 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3060 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3061 break; 3062 case MATPRODUCT_AtB: 3063 m = A->cmap->n; 3064 n = B->cmap->n; 3065 k = A->rmap->n; 3066 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3067 Amat = Acusp->matTranspose; 3068 Bmat = Bcusp->mat; 3069 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3070 break; 3071 case MATPRODUCT_ABt: 3072 m = A->rmap->n; 3073 n = B->rmap->n; 3074 k = A->cmap->n; 3075 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3076 Amat = Acusp->mat; 3077 Bmat = Bcusp->matTranspose; 3078 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3079 break; 3080 default: 3081 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3082 } 3083 3084 /* create cusparse matrix */ 3085 
PetscCall(MatSetSizes(C, m, n, m, n)); 3086 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3087 c = (Mat_SeqAIJ *)C->data; 3088 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3089 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3090 Ccsr = new CsrMatrix; 3091 3092 c->compressedrow.use = ciscompressed; 3093 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3094 c->compressedrow.nrows = a->compressedrow.nrows; 3095 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3096 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3097 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3098 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3099 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3100 } else { 3101 c->compressedrow.nrows = 0; 3102 c->compressedrow.i = NULL; 3103 c->compressedrow.rindex = NULL; 3104 Ccusp->workVector = NULL; 3105 Cmat->cprowIndices = NULL; 3106 } 3107 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 3108 Ccusp->mat = Cmat; 3109 Ccusp->mat->mat = Ccsr; 3110 Ccsr->num_rows = Ccusp->nrows; 3111 Ccsr->num_cols = n; 3112 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3113 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3114 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3115 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3116 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3117 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3118 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3119 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3120 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3121 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3122 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3123 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3124 c->nz = 0; 3125 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3126 Ccsr->values = new THRUSTARRAY(c->nz); 3127 goto finalizesym; 3128 } 3129 3130 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3131 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3132 Acsr = (CsrMatrix *)Amat->mat; 3133 if (!biscompressed) { 3134 Bcsr = (CsrMatrix *)Bmat->mat; 3135 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3136 BmatSpDescr = Bmat->matDescr; 3137 #endif 3138 } else { /* we need to use row offsets for the full matrix */ 3139 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3140 Bcsr = new CsrMatrix; 3141 Bcsr->num_rows = B->rmap->n; 3142 Bcsr->num_cols = cBcsr->num_cols; 3143 Bcsr->num_entries = cBcsr->num_entries; 3144 Bcsr->column_indices = cBcsr->column_indices; 3145 Bcsr->values = cBcsr->values; 3146 if (!Bcusp->rowoffsets_gpu) { 3147 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3148 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3149 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3150 } 3151 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3152 mmdata->Bcsr = Bcsr; 3153 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3154 if (Bcsr->num_rows && Bcsr->num_cols) { 3155 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3156 PetscCallCUSPARSE(stat); 3157 } 3158 BmatSpDescr = mmdata->matSpBDescr; 3159 #endif 3160 } 3161 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3162 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3163 /* precompute flops count */ 3164 if (ptype == MATPRODUCT_AB) { 3165 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3166 const PetscInt st = a->i[i]; 3167 const PetscInt en = a->i[i + 1]; 3168 for (j = st; j < en; j++) { 3169 const PetscInt brow = a->j[j]; 3170 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3171 } 3172 } 3173 } else if (ptype == MATPRODUCT_AtB) { 3174 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3175 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3176 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3177 flops += (2. * anzi) * bnzi; 3178 } 3179 } else { /* TODO */ 3180 flops = 0.; 3181 } 3182 3183 mmdata->flops = flops; 3184 PetscCall(PetscLogGpuTimeBegin()); 3185 3186 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3187 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3188 // cuda-12.2 requires non-null csrRowOffsets 3189 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3190 PetscCallCUSPARSE(stat); 3191 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3192 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3193 { 3194 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3195 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3196 */ 3197 void *dBuffer1 = NULL; 3198 void *dBuffer2 = NULL; 3199 void *dBuffer3 = NULL; 3200 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3201 size_t bufferSize1 = 0; 3202 size_t bufferSize2 = 0; 3203 size_t bufferSize3 = 0; 3204 size_t bufferSize4 = 0; 3205 size_t bufferSize5 = 0; 3206 3207 /* ask bufferSize1 bytes for external memory */ 3208 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3209 PetscCallCUSPARSE(stat); 3210 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3211 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3212 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3213 PetscCallCUSPARSE(stat); 3214 3215 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3216 PetscCallCUSPARSE(stat); 3217 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3218 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3219 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3220 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3221 PetscCallCUSPARSE(stat); 3222 PetscCallCUDA(cudaFree(dBuffer1)); 3223 PetscCallCUDA(cudaFree(dBuffer2)); 3224 3225 /* get matrix C non-zero entries C_nnz1 */ 3226 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3227 c->nz = (PetscInt)C_nnz1; 3228 /* allocate matrix C */ 3229 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3230 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3231 Ccsr->values = new THRUSTARRAY(c->nz); 3232 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3233 /* update matC with the new pointers */ 3234 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3235 PetscCallCUSPARSE(stat); 3236 3237 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3238 PetscCallCUSPARSE(stat); 3239 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3240 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3241 PetscCallCUSPARSE(stat); 3242 PetscCallCUDA(cudaFree(dBuffer3)); 3243 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3244 PetscCallCUSPARSE(stat); 3245 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3246 } 3247 #else 3248 size_t bufSize2; 3249 /* ask bufferSize bytes for external memory */ 3250 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3251 PetscCallCUSPARSE(stat); 3252 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3253 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3254 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3255 PetscCallCUSPARSE(stat); 3256 /* ask bufferSize again bytes for external memory */ 3257 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3258 PetscCallCUSPARSE(stat); 3259 /* The CUSPARSE documentation is not clear, nor the API 3260 We need both buffers to perform the operations properly! 3261 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3262 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3263 is stored in the descriptor! What a messy API... 
*/ 3264 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3265 /* compute the intermediate product of A * B */ 3266 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3267 PetscCallCUSPARSE(stat); 3268 /* get matrix C non-zero entries C_nnz1 */ 3269 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3270 c->nz = (PetscInt)C_nnz1; 3271 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3272 mmdata->mmBufferSize / 1024)); 3273 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3274 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3275 Ccsr->values = new THRUSTARRAY(c->nz); 3276 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3277 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3278 PetscCallCUSPARSE(stat); 3279 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3280 PetscCallCUSPARSE(stat); 3281 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3282 #else 3283 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3284 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3285 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3286 PetscCallCUSPARSE(stat); 3287 c->nz = cnz; 3288 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3289 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3290 Ccsr->values = new THRUSTARRAY(c->nz); 3291 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3292 3293 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3294 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3295 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3296 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3297 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3298 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3299 PetscCallCUSPARSE(stat); 3300 #endif 3301 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3302 PetscCall(PetscLogGpuTimeEnd()); 3303 finalizesym: 3304 c->free_a = PETSC_TRUE; 3305 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3306 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3307 c->free_ij = PETSC_TRUE; 3308 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3309 PetscInt *d_i = c->i; 3310 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3311 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3312 ii = 
*Ccsr->row_offsets; 3313 jj = *Ccsr->column_indices; 3314 if (ciscompressed) d_i = c->compressedrow.i; 3315 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3316 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3317 } else { 3318 PetscInt *d_i = c->i; 3319 if (ciscompressed) d_i = c->compressedrow.i; 3320 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3321 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3322 } 3323 if (ciscompressed) { /* need to expand host row offsets */ 3324 PetscInt r = 0; 3325 c->i[0] = 0; 3326 for (k = 0; k < c->compressedrow.nrows; k++) { 3327 const PetscInt next = c->compressedrow.rindex[k]; 3328 const PetscInt old = c->compressedrow.i[k]; 3329 for (; r < next; r++) c->i[r + 1] = old; 3330 } 3331 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3332 } 3333 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3334 PetscCall(PetscMalloc1(m, &c->ilen)); 3335 PetscCall(PetscMalloc1(m, &c->imax)); 3336 c->maxnz = c->nz; 3337 c->nonzerorowcnt = 0; 3338 c->rmax = 0; 3339 for (k = 0; k < m; k++) { 3340 const PetscInt nn = c->i[k + 1] - c->i[k]; 3341 c->ilen[k] = c->imax[k] = nn; 3342 c->nonzerorowcnt += (PetscInt) !!nn; 3343 c->rmax = PetscMax(c->rmax, nn); 3344 } 3345 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3346 PetscCall(PetscMalloc1(c->nz, &c->a)); 3347 Ccsr->num_entries = c->nz; 3348 3349 C->nonzerostate++; 3350 PetscCall(PetscLayoutSetUp(C->rmap)); 3351 PetscCall(PetscLayoutSetUp(C->cmap)); 3352 Ccusp->nonzerostate = C->nonzerostate; 3353 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3354 C->preallocated = PETSC_TRUE; 3355 C->assembled = 
PETSC_FALSE; 3356 C->was_assembled = PETSC_FALSE; 3357 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3358 mmdata->reusesym = PETSC_TRUE; 3359 C->offloadmask = PETSC_OFFLOAD_GPU; 3360 } 3361 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3362 PetscFunctionReturn(PETSC_SUCCESS); 3363 } 3364 3365 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3366 3367 /* handles sparse or dense B */ 3368 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3369 { 3370 Mat_Product *product = mat->product; 3371 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3372 3373 PetscFunctionBegin; 3374 MatCheckProduct(mat, 1); 3375 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3376 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3377 if (product->type == MATPRODUCT_ABC) { 3378 Ciscusp = PETSC_FALSE; 3379 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3380 } 3381 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3382 PetscBool usecpu = PETSC_FALSE; 3383 switch (product->type) { 3384 case MATPRODUCT_AB: 3385 if (product->api_user) { 3386 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3387 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3388 PetscOptionsEnd(); 3389 } else { 3390 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3391 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 
3392 PetscOptionsEnd(); 3393 } 3394 break; 3395 case MATPRODUCT_AtB: 3396 if (product->api_user) { 3397 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3398 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3399 PetscOptionsEnd(); 3400 } else { 3401 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3402 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3403 PetscOptionsEnd(); 3404 } 3405 break; 3406 case MATPRODUCT_PtAP: 3407 if (product->api_user) { 3408 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3409 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3410 PetscOptionsEnd(); 3411 } else { 3412 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3413 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3414 PetscOptionsEnd(); 3415 } 3416 break; 3417 case MATPRODUCT_RARt: 3418 if (product->api_user) { 3419 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3420 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3421 PetscOptionsEnd(); 3422 } else { 3423 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3424 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3425 PetscOptionsEnd(); 3426 } 3427 break; 3428 case MATPRODUCT_ABC: 3429 if (product->api_user) { 3430 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, 
"MatMatMatMult", "Mat"); 3431 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3432 PetscOptionsEnd(); 3433 } else { 3434 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3435 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3436 PetscOptionsEnd(); 3437 } 3438 break; 3439 default: 3440 break; 3441 } 3442 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3443 } 3444 /* dispatch */ 3445 if (isdense) { 3446 switch (product->type) { 3447 case MATPRODUCT_AB: 3448 case MATPRODUCT_AtB: 3449 case MATPRODUCT_ABt: 3450 case MATPRODUCT_PtAP: 3451 case MATPRODUCT_RARt: 3452 if (product->A->boundtocpu) { 3453 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3454 } else { 3455 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3456 } 3457 break; 3458 case MATPRODUCT_ABC: 3459 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3460 break; 3461 default: 3462 break; 3463 } 3464 } else if (Biscusp && Ciscusp) { 3465 switch (product->type) { 3466 case MATPRODUCT_AB: 3467 case MATPRODUCT_AtB: 3468 case MATPRODUCT_ABt: 3469 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3470 break; 3471 case MATPRODUCT_PtAP: 3472 case MATPRODUCT_RARt: 3473 case MATPRODUCT_ABC: 3474 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3475 break; 3476 default: 3477 break; 3478 } 3479 } else { /* fallback for AIJ */ 3480 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3481 } 3482 PetscFunctionReturn(PETSC_SUCCESS); 3483 } 3484 3485 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3486 { 3487 PetscFunctionBegin; 3488 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3489 PetscFunctionReturn(PETSC_SUCCESS); 3490 } 3491 3492 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, 
Vec yy, Vec zz) 3493 { 3494 PetscFunctionBegin; 3495 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3496 PetscFunctionReturn(PETSC_SUCCESS); 3497 } 3498 3499 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3500 { 3501 PetscFunctionBegin; 3502 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3503 PetscFunctionReturn(PETSC_SUCCESS); 3504 } 3505 3506 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3507 { 3508 PetscFunctionBegin; 3509 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3510 PetscFunctionReturn(PETSC_SUCCESS); 3511 } 3512 3513 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3514 { 3515 PetscFunctionBegin; 3516 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3517 PetscFunctionReturn(PETSC_SUCCESS); 3518 } 3519 3520 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3521 { 3522 int i = blockIdx.x * blockDim.x + threadIdx.x; 3523 if (i < n) y[idx[i]] += x[i]; 3524 } 3525 3526 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3527 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3528 { 3529 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3530 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3531 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3532 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3533 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3534 PetscBool compressed; 3535 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3536 PetscInt nx, ny; 3537 #endif 3538 3539 PetscFunctionBegin; 3540 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3541 if (!a->nz) { 3542 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3543 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3544 PetscFunctionReturn(PETSC_SUCCESS); 3545 } 3546 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3547 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3548 if (!trans) { 3549 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3550 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3551 } else { 3552 if (herm || !A->form_explicit_transpose) { 3553 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3554 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3555 } else { 3556 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3557 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3558 } 3559 } 3560 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3561 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3562 3563 try { 3564 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3565 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3566 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3567 3568 PetscCall(PetscLogGpuTimeBegin()); 3569 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3570 /* z = A x + beta y. 3571 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3572 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3573 */ 3574 xptr = xarray; 3575 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3576 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3577 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3578 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3579 allocated to accommodate different uses. So we get the length info directly from mat. 3580 */ 3581 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3582 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3583 nx = mat->num_cols; 3584 ny = mat->num_rows; 3585 } 3586 #endif 3587 } else { 3588 /* z = A^T x + beta y 3589 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3590 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3591 */ 3592 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3593 dptr = zarray; 3594 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3595 if (compressed) { /* Scatter x to work vector */ 3596 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3597 3598 thrust::for_each( 3599 #if PetscDefined(HAVE_THRUST_ASYNC) 3600 thrust::cuda::par.on(PetscDefaultCudaStream), 3601 #endif 3602 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3603 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3604 } 3605 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3606 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3607 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3608 nx = mat->num_rows; 3609 ny = mat->num_cols; 3610 } 3611 #endif 3612 } 3613 3614 /* csr_spmv does y = alpha op(A) x + beta y */ 3615 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3616 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3617 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3618 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3619 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3620 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3621 PetscCallCUSPARSE( 3622 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3623 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3624 3625 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3626 } else { 3627 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3628 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3629 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3630 } 3631 3632 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3633 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3634 #else 3635 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3636 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3637 #endif 3638 } else { 3639 if (cusparsestruct->nrows) { 3640 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3641 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3642 #else 3643 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3644 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3645 #endif 3646 } 3647 } 3648 PetscCall(PetscLogGpuTimeEnd()); 3649 3650 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3651 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3652 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3653 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3654 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3655 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3656 } 3657 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3658 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3659 } 3660 3661 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3662 if (compressed) { 3663 PetscCall(PetscLogGpuTimeBegin()); 3664 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3665 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3666 prevent that. So I just add a ScatterAdd kernel. 3667 */ 3668 #if 0 3669 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3670 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3671 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3672 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3673 VecCUDAPlusEquals()); 3674 #else 3675 PetscInt n = matstruct->cprowIndices->size(); 3676 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3677 #endif 3678 PetscCall(PetscLogGpuTimeEnd()); 3679 } 3680 } else { 3681 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3682 } 3683 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3684 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3685 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3686 } catch (char *ex) { 3687 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, 
             "CUSPARSE error: %s", ex);
  }
  /* Flop count: 2*nz for y += A*x; when there is no vector to add (yy == NULL),
     subtract one flop per nonzero row (no add into a prior value) */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T*xx + yy, delegating to the shared mult-add kernel with trans=PETSC_TRUE, herm=PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly is done by the host (SeqAIJ) routine; device copies are refreshed lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.
That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* call the SeqAIJ preallocation directly; the cast drops const to match its signature */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy both the GPU-side data (plain matrix or triangular factors) and the composed
   function slots, then fall through to the host SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* remove every method composed on this object by MatConvert/MatBindToCPU */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate on the host as SeqAIJ, then convert the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = Y + a*X on the GPU. Uses cuBLAS axpy when the nonzero patterns match,
   cuSPARSE spgeam for SUBSET_NONZERO_PATTERN, and falls back to the host
   SeqAIJ implementation otherwise */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* differing implementations: do it on the host and invalidate Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix
          *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical
     row offsets + column indices mean the patterns really are the same */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cusparse spgeam, writing the result back into Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b live on the host here, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query + allocation before spgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    /* pre-CUDA-11 spgeam needs no workspace */
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the handle's default pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical pattern: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = a*Y by scaling the nonzero value array with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal is stale after scaling */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero the matrix values on both device (CSR and cached transpose, if present) and host.
   The offload mask records whether both copies are now valid or only the CPU copy */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values zeroed too, so both copies agree */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* zero the host values as well */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Switch the matrix's operation table between the host (SeqAIJ) and GPU (CUSPARSE)
   implementations. flg == PETSC_TRUE binds to the CPU */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy is current before handing control to CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale       = MatScale_SeqAIJ;
    A->ops->axpy        = MatAXPY_SeqAIJ;
    A->ops->zeroentries = MatZeroEntries_SeqAIJ;
    A->ops->mult        = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ sub-ops so the default (host) array accessors are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* drop the GPU-specific composed methods while bound to the CPU */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* restore the GPU implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* device-aware array accessors */
    a->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    /* re-install the GPU-specific composed methods */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: attach the GPU context (cusparse handle,
   storage format, algorithm choices), switch the vector type to CUDA, and install the
   CUSPARSE operation table */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* run all cuSPARSE work on PETSc's default stream */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: attach the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  Options Database Keys:
+ -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                     Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Register the cusparse solver type for all factorizations it supports on SeqAIJCUSPARSE */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free all GPU-side state of a non-factored matrix: mult structs (matrix and cached
   transpose), thrust work arrays, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Delete a CsrMatrix's thrust arrays and the struct itself; safe on NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor struct (pre-CUDA-11.4 path): descriptor, solve info,
   CSR data, and the device/host scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Free a mult struct: the stored matrix (CSR or HYB depending on format), its
   descriptor, compressed-row indices, the device-resident scalar constants, and
   (CUDA >= 11) the cached SpMV descriptors/buffers for each operation type */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-allocated scalar constants */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)
      PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* release the per-operation SpMV descriptors and workspaces built on demand */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) { /* one slot per cusparseOperation_t value */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all data held by a triangular-factors context without freeing the context
   itself (it can be refilled by a new factorization). Handles both the legacy
   (< CUDA 11.4) and the SpSV-based (>= CUDA 11.4) layouts */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    /* destroy the cusparse descriptors built for the SpSV-based solve path */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    /* host-side shadow copies used during factorization */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Full teardown of the triangular-factors context: reset contents, then free the
   cusparse handle and the context struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Lexicographic (row, column) comparator for sorting (i,j) tuples on the device */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

static PetscErrorCode
MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    /* drop the cached explicit transpose and the csr2csc index map entirely */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* mark the (possibly retained) transpose as stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Container destructor for the device-resident COO struct: free the device arrays,
   then the struct itself */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Set up COO assembly: build the host-side COO structure via the SeqAIJ routine
   (copying coo_i/coo_j to the host first if they are device pointers), then mirror
   the jmap/perm arrays to the device and attach them to the matrix in a container */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the SeqAIJ setup runs on the host, so stage device indices through host buffers */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
4322 } 4323 4324 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4325 { 4326 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4327 const PetscCount grid_size = gridDim.x * blockDim.x; 4328 for (; i < nnz; i += grid_size) { 4329 PetscScalar sum = 0.0; 4330 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4331 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4332 } 4333 } 4334 4335 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4336 { 4337 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4338 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4339 PetscCount Annz = seq->nz; 4340 PetscMemType memtype; 4341 const PetscScalar *v1 = v; 4342 PetscScalar *Aa; 4343 PetscContainer container; 4344 MatCOOStruct_SeqAIJ *coo; 4345 4346 PetscFunctionBegin; 4347 if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4348 4349 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 4350 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 4351 4352 PetscCall(PetscGetMemType(v, &memtype)); 4353 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4354 PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 4355 PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4356 } 4357 4358 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4359 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4360 4361 PetscCall(PetscLogGpuTimeBegin()); 4362 if (Annz) { 4363 MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa); 4364 PetscCallCUDA(cudaPeekAtLastError()); 4365 } 4366 PetscCall(PetscLogGpuTimeEnd()); 4367 4368 if (imode == INSERT_VALUES) 
PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4369 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4370 4371 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4372 PetscFunctionReturn(PETSC_SUCCESS); 4373 } 4374 4375 /*@C 4376 MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices. 4377 4378 Not Collective 4379 4380 Input Parameters: 4381 + A - the matrix 4382 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 4383 4384 Output Parameters: 4385 + i - the CSR row pointers 4386 - j - the CSR column indices 4387 4388 Level: developer 4389 4390 Note: 4391 When compressed is true, the CSR structure does not contain empty rows 4392 4393 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4394 @*/ 4395 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4396 { 4397 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4398 CsrMatrix *csr; 4399 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4400 4401 PetscFunctionBegin; 4402 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4403 if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 4404 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4405 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4406 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4407 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4408 csr = (CsrMatrix *)cusp->mat->mat; 4409 if (i) { 4410 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4411 if (!cusp->rowoffsets_gpu) { 4412 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4413 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4414 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4415 } 4416 *i = 
cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i - the CSR row pointers
- j - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Only invalidates the caller's pointers; no device work is needed for read access
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // Make sure the device copy is current before handing out a pointer to it
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Read-only access: no state change, just drop the caller's pointer
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  // The caller may write through the pointer: device copy becomes authoritative
  // and any cached transpose values are now stale
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  // Values may have changed: cached diagonal is stale and the object state must advance
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  // Write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is
  // performed here because the existing values will be overwritten
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Comparator for tuples (row, col, value, flag): lexicographic order on (row, col).
// Used by thrust::merge below to interleave the COO streams of two matrices.
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

// Functor adding a fixed offset to an index (e.g. to shift B's column indices past A's columns)
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    // Build C = [A B] (m x (nA + nB)) from scratch, directly on the device
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                     = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                 = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                  = new CsrMatrix;
    Cmat->cprowIndices    = NULL;
    c->compressedrow.use  = PETSC_FALSE; // C is assembled with full (uncompressed) rows
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    // Device-resident scalar constants used by cusparse SpMV/SpMM calls
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    // coords records, for every entry of A and B, its position in C's value array;
    // it is replayed in the MAT_REUSE_MATRIX path to scatter updated values
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      // Expand both CSR row-offset arrays to COO row indices
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      // Per-entry origin flag carried through the merge: 1 for A entries, 0 for B entries
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      // B's columns are shifted by A->cmap->n on the fly so they land after A's columns in C
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      // Merge the two (row, col, value, flag) streams in (row, col) order; both inputs are
      // already sorted since they come from valid CSR structures
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      // Undo the in-place column shift applied to B above
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      // Split the merged positions by origin flag: coords[0..Annz) = positions of A's
      // entries in C, coords[Annz..nz) = positions of B's entries
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      // Compress the merged COO row indices back into C's CSR row offsets
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        // C^T = [A^T; B^T]: a vertical stack, so the transpose CSR is the simple
        // concatenation of A^T and B^T (with B^T's row offsets shifted by A's nnz)
        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); // overlap the shared boundary offset with B^T's first one
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    // Mirror C's structure on the host so the plain SeqAIJ data is also valid
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    // Fill per-row lengths and row statistics from the freshly copied row offsets
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    // MAT_REUSE_MATRIX: the structure of C is unchanged, only the values are refreshed
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      // Scatter A's values into C through the recorded coords[0..Annz) permutation
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      // Likewise for B's values through coords[Annz..nz)
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        // Refresh the concatenated transpose values as well (structure is unchanged)
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; // device copy is the authoritative one after assembly
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Gather v[k] = A_values[idx[k]] for k in [0, n) on the device; idx is a host index
// array. v may be host or device memory: results are copied back to host when needed.
// With idx == NULL the first n values are copied contiguously instead.
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    // Upload the index list, then gather through a permutation iterator
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                   *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      // v is host memory: gather into a device scratch array first
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()