1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #include <thrust/async/for_each.h> 19 #endif 20 #include <thrust/iterator/constant_iterator.h> 21 #include <thrust/remove.h> 22 #include <thrust/sort.h> 23 #include <thrust/unique.h> 24 25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 28 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
29 30 typedef enum { 31 CUSPARSE_MV_ALG_DEFAULT = 0, 32 CUSPARSE_COOMV_ALG = 1, 33 CUSPARSE_CSRMV_ALG1 = 2, 34 CUSPARSE_CSRMV_ALG2 = 3 35 } cusparseSpMVAlg_t; 36 37 typedef enum { 38 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 39 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 40 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 41 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 42 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 43 CUSPARSE_SPMM_ALG_DEFAULT = 0, 44 CUSPARSE_SPMM_COO_ALG1 = 1, 45 CUSPARSE_SPMM_COO_ALG2 = 2, 46 CUSPARSE_SPMM_COO_ALG3 = 3, 47 CUSPARSE_SPMM_COO_ALG4 = 5, 48 CUSPARSE_SPMM_CSR_ALG1 = 4, 49 CUSPARSE_SPMM_CSR_ALG2 = 6, 50 } cusparseSpMMAlg_t; 51 52 typedef enum { 53 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 54 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 55 } cusparseCsr2CscAlg_t; 56 */ 57 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 58 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 60 #endif 61 62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 66 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 72 #endif 73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 83 84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 86 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **); 88 89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 91 92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 95 96 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 97 { 98 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 99 100 PetscFunctionBegin; 101 switch (op) { 102 case MAT_CUSPARSE_MULT: 103 cusparsestruct->format = format; 104 break; 105 case MAT_CUSPARSE_ALL: 106 cusparsestruct->format = format; 107 break; 108 default: 109 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 110 } 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 /*@ 115 MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 116 operation. Only the `MatMult()` operation can use different GPU storage formats 117 118 Not Collective 119 120 Input Parameters: 121 + A - Matrix of type `MATSEQAIJCUSPARSE` 122 . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 123 `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 124 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 
125 126 Level: intermediate 127 128 .seealso: [](chapter_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129 @*/ 130 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 131 { 132 PetscFunctionBegin; 133 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 134 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 139 { 140 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 141 142 PetscFunctionBegin; 143 cusparsestruct->use_cpu_solve = use_cpu; 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 /*@ 148 MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 149 150 Input Parameters: 151 + A - Matrix of type `MATSEQAIJCUSPARSE` 152 - use_cpu - set flag for using the built-in CPU `MatSolve()` 153 154 Level: intermediate 155 156 Note: 157 The cuSparse LU solver currently computes the factors with the built-in CPU method 158 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 159 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
160 161 .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 162 @*/ 163 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 164 { 165 PetscFunctionBegin; 166 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 167 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 168 PetscFunctionReturn(PETSC_SUCCESS); 169 } 170 171 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 172 { 173 PetscFunctionBegin; 174 switch (op) { 175 case MAT_FORM_EXPLICIT_TRANSPOSE: 176 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 177 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 178 A->form_explicit_transpose = flg; 179 break; 180 default: 181 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 182 break; 183 } 184 PetscFunctionReturn(PETSC_SUCCESS); 185 } 186 187 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 188 { 189 MatCUSPARSEStorageFormat format; 190 PetscBool flg; 191 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 192 193 PetscFunctionBegin; 194 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 195 if (A->factortype == MAT_FACTOR_NONE) { 196 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 197 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 198 199 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 200 if (flg) 
PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 201 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 202 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 204 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 205 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 206 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 207 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 208 #else 209 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 210 #endif 211 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 212 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 213 214 PetscCall( 215 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 216 PetscCheck(!flg || 
CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 217 #endif 218 } 219 PetscOptionsHeadEnd(); 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 225 { 226 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 227 PetscInt m = A->rmap->n; 228 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 229 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 230 const MatScalar *Aa = a->a; 231 PetscInt *Mi, *Mj, Mnz; 232 PetscScalar *Ma; 233 234 PetscFunctionBegin; 235 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 236 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 237 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 238 Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 239 PetscCall(PetscMalloc1(m + 1, &Mi)); 240 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 241 PetscCall(PetscMalloc1(Mnz, &Ma)); 242 Mi[0] = 0; 243 for (PetscInt i = 0; i < m; i++) { 244 PetscInt llen = Ai[i + 1] - Ai[i]; 245 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 246 PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 247 Mj[Mi[i] + llen] = i; // diagonal entry 248 PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 249 Mi[i + 1] = Mi[i] + llen + ulen; 250 } 251 // Copy M (L,U) from host to device 252 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1))); 253 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz)); 254 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz)); 255 
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice)); 256 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice)); 257 258 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 259 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 260 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 261 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 262 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 263 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER; 264 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; 265 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 266 267 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 268 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 269 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 270 271 fillMode = CUSPARSE_FILL_MODE_UPPER; 272 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 273 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 274 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 275 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 276 277 // 
Allocate work vectors in SpSv 278 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m)); 279 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m)); 280 281 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 282 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 283 284 // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE 285 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 286 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 287 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 288 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 289 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 290 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 291 292 // Record for reuse 293 fs->csrRowPtr_h = Mi; 294 fs->csrVal_h = Ma; 295 PetscCall(PetscFree(Mj)); 296 } 297 // Copy the value 298 Mi = fs->csrRowPtr_h; 299 Ma = fs->csrVal_h; 300 Mnz = Mi[m]; 301 for (PetscInt i = 0; i < m; i++) { 302 PetscInt llen = Ai[i + 1] - Ai[i]; 303 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 304 PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L 305 Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry 306 PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 307 } 308 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, 
cudaMemcpyHostToDevice)); 309 310 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 311 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 312 313 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 314 315 // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve 316 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 317 } 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 #else 321 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 322 { 323 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 324 PetscInt n = A->rmap->n; 325 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 326 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 327 const PetscInt *ai = a->i, *aj = a->j, *vi; 328 const MatScalar *aa = a->a, *v; 329 PetscInt *AiLo, *AjLo; 330 PetscInt i, nz, nzLower, offset, rowOffset; 331 332 PetscFunctionBegin; 333 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 334 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 335 try { 336 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* First call: build pattern and values on the host, move them to the GPU and run the solve analysis */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 holds only its unit diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal after the copied off-diagonal entries of row i */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; the assign() calls below fill the THRUST arrays from the host buffers */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; the pinned value buffer is kept in AA_h for later value-only updates,
           while the index buffers are released */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix, using the same layout as in the first-call branch above */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Builds (first call) or refreshes (later calls) the upper triangular ILU factor used by the
  legacy csrsv solve path, from the host factored matrix of A.

  Returns immediately for an empty matrix. Otherwise it acts only when A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU. If cusparseTriFactors->upTriFactorPtr is not yet set,
  the CSR pattern and values are assembled in pinned host memory, copied into THRUST arrays, and the
  cuSPARSE solve analysis is run. Otherwise only the values are rebuilt and re-assigned.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking the rows from last to first so that
           each row offset can be derived from the one after it */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; v[nz] is aa[adiag[i]], and its reciprocal is what is stored */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; the assign() calls below fill the THRUST arrays from the host buffers */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; the pinned value buffer is kept in AA_h for later value-only updates,
           while the index buffers are released */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix, using the same layout as in the first-call branch above */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  Brings the (I)LU factors of A to the GPU and caches the row/column permutations there.

  Errors with PETSC_ERR_COR if A->spptr is null. Depending on the CUDA version it calls either
  MatSeqAIJCUSPARSEBuildFactoredMatrix_LU() or the legacy lower/upper builders (which also need a
  work vector of length n). It then records nnz, sets A->offloadmask to PETSC_OFFLOAD_BOTH, and, for a
  non-identity row or column ordering whose index array has not been created yet, copies the
  permutation indices into a THRUSTINTARRAY.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices: row permutation, created once and only if it is not the identity */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: column permutation, created once and only if it is not the identity */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
647 Mnz = Ai[m]; // Unz (with the unit diagonal) 648 PetscCall(PetscMalloc1(Mnz, &Ma)); 649 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 650 PetscCall(PetscMalloc1(m, &D)); // the diagonal 651 for (PetscInt i = 0; i < m; i++) { 652 PetscInt ulen = Ai[i + 1] - Ai[i]; 653 Mj[Ai[i]] = i; // diagonal entry 654 PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 655 } 656 // Copy M (U) from host to device 657 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1))); 658 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz)); 659 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz)); 660 PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m)); 661 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 662 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 663 664 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 665 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 666 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 667 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 668 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 669 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 670 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 671 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 672 673 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 674 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 675 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 676 677 // Allocate work vectors in SpSv 678 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m)); 679 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m)); 680 681 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 682 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 683 684 // Query buffer sizes for SpSV and then allocate buffers 685 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 686 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 687 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 688 689 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 690 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 691 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 692 693 // Record for reuse 694 fs->csrVal_h = Ma; 695 fs->diag_h = D; 696 PetscCall(PetscFree(Mj)); 697 } 698 // Copy the value 699 Ma = fs->csrVal_h; 700 D = fs->diag_h; 701 Mnz = Ai[m]; 702 
for (PetscInt i = 0; i < m; i++) { 703 D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 704 Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 705 for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 706 } 707 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 708 PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 709 710 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 711 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 712 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 713 } 714 PetscFunctionReturn(PETSC_SUCCESS); 715 } 716 717 // Solve Ut D U x = b 718 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 719 { 720 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 721 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 722 const PetscScalar *barray; 723 PetscScalar *xarray; 724 thrust::device_ptr<const PetscScalar> bGPU; 725 thrust::device_ptr<PetscScalar> xGPU; 726 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 727 PetscInt m = A->rmap->n; 728 729 PetscFunctionBegin; 730 PetscCall(PetscLogGpuTimeBegin()); 731 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 732 PetscCall(VecCUDAGetArrayRead(b, &barray)); 733 xGPU = thrust::device_pointer_cast(xarray); 734 bGPU = thrust::device_pointer_cast(barray); 735 736 // Reorder b with the row permutation 
if needed, and wrap the result in fs->X 737 if (fs->rpermIndices) { 738 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 739 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 740 } else { 741 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 742 } 743 744 // Solve Ut Y = X 745 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 746 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 747 748 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 749 // It is basically a vector element-wise multiplication, but cublas does not have it! 
750 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 751 752 // Solve U X = Y 753 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 754 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 755 } else { 756 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 757 } 758 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 759 760 // Reorder X with the column permutation if needed, and put the result back to x 761 if (fs->cpermIndices) { 762 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 763 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 764 } 765 766 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 767 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 768 PetscCall(PetscLogGpuTimeEnd()); 769 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 770 PetscFunctionReturn(PETSC_SUCCESS); 771 } 772 #else 773 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 774 { 775 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 776 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 777 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 778 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 779 PetscInt *AiUp, *AjUp; 780 PetscScalar *AAUp; 781 PetscScalar *AALo; 782 PetscInt nzUpper = 
a->nz, n = A->rmap->n, i, offset, nz, j; 783 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 784 const PetscInt *ai = b->i, *aj = b->j, *vj; 785 const MatScalar *aa = b->a, *v; 786 787 PetscFunctionBegin; 788 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 789 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 790 try { 791 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 792 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 793 if (!upTriFactor && !loTriFactor) { 794 /* Allocate Space for the upper triangular matrix */ 795 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 796 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 797 798 /* Fill the upper triangular matrix */ 799 AiUp[0] = (PetscInt)0; 800 AiUp[n] = nzUpper; 801 offset = 0; 802 for (i = 0; i < n; i++) { 803 /* set the pointers */ 804 v = aa + ai[i]; 805 vj = aj + ai[i]; 806 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 807 808 /* first, set the diagonal elements */ 809 AjUp[offset] = (PetscInt)i; 810 AAUp[offset] = (MatScalar)1.0 / v[nz]; 811 AiUp[i] = offset; 812 AALo[offset] = (MatScalar)1.0 / v[nz]; 813 814 offset += 1; 815 if (nz > 0) { 816 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 817 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 818 for (j = offset; j < offset + nz; j++) { 819 AAUp[j] = -AAUp[j]; 820 AALo[j] = AAUp[j] / v[nz]; 821 } 822 offset += nz; 823 } 824 } 825 826 /* allocate space for the triangular factor information */ 827 PetscCall(PetscNew(&upTriFactor)); 828 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 829 830 /* Create the matrix description */ 831 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 832 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 833 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 834 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL)); 835 #else 836 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 837 #endif 838 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 839 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 840 841 /* set the matrix */ 842 upTriFactor->csrMat = new CsrMatrix; 843 upTriFactor->csrMat->num_rows = A->rmap->n; 844 upTriFactor->csrMat->num_cols = A->cmap->n; 845 upTriFactor->csrMat->num_entries = a->nz; 846 847 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 848 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 849 850 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 851 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 852 853 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 854 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 855 856 /* set the operation */ 857 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 858 859 /* Create the solve analysis information */ 860 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 861 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 862 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 863 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 864 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 865 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 866 #endif 867 868 /* perform the solve analysis */ 869 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, 
upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 870 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 871 872 PetscCallCUDA(WaitForCUDA()); 873 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 874 875 /* assign the pointer */ 876 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 877 878 /* allocate space for the triangular factor information */ 879 PetscCall(PetscNew(&loTriFactor)); 880 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 881 882 /* Create the matrix description */ 883 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 884 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 885 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 886 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 887 #else 888 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 889 #endif 890 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 891 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 892 893 /* set the operation */ 894 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 895 896 /* set the matrix */ 897 loTriFactor->csrMat = new CsrMatrix; 898 loTriFactor->csrMat->num_rows = A->rmap->n; 899 loTriFactor->csrMat->num_cols = A->cmap->n; 900 loTriFactor->csrMat->num_entries = a->nz; 901 902 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 903 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 904 905 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 906 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 907 908 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 909 loTriFactor->csrMat->values->assign(AALo, AALo + 
a->nz); 910 911 /* Create the solve analysis information */ 912 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 913 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 914 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 915 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 916 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 917 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 918 #endif 919 920 /* perform the solve analysis */ 921 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 922 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 923 924 PetscCallCUDA(WaitForCUDA()); 925 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 926 927 /* assign the pointer */ 928 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 929 930 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 931 PetscCallCUDA(cudaFreeHost(AiUp)); 932 PetscCallCUDA(cudaFreeHost(AjUp)); 933 } else { 934 /* Fill the upper triangular matrix */ 935 offset = 0; 936 for (i = 0; i < n; i++) { 937 /* set the pointers */ 938 v = aa + ai[i]; 939 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 940 941 /* first, set the diagonal elements */ 942 AAUp[offset] = 1.0 / v[nz]; 943 AALo[offset] = 1.0 / v[nz]; 944 945 offset += 1; 946 if (nz > 0) { 947 PetscCall(PetscArraycpy(&(AAUp[offset]), v, 
nz)); 948 for (j = offset; j < offset + nz; j++) { 949 AAUp[j] = -AAUp[j]; 950 AALo[j] = AAUp[j] / v[nz]; 951 } 952 offset += nz; 953 } 954 } 955 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 956 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 957 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 958 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 959 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 960 } 961 PetscCallCUDA(cudaFreeHost(AAUp)); 962 PetscCallCUDA(cudaFreeHost(AALo)); 963 } catch (char *ex) { 964 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 965 } 966 } 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 #endif 970 971 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 972 { 973 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 974 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 975 IS ip = a->row; 976 PetscBool perm_identity; 977 PetscInt n = A->rmap->n; 978 979 PetscFunctionBegin; 980 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 981 982 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 983 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 984 #else 985 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 986 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 987 #endif 988 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 989 990 A->offloadmask = PETSC_OFFLOAD_BOTH; 991 992 /* lower triangular indices */ 993 PetscCall(ISIdentity(ip, &perm_identity)); 994 if (!perm_identity) { 995 IS iip; 996 const PetscInt *irip, *rip; 997 998 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 999 PetscCall(ISGetIndices(iip, &irip)); 1000 PetscCall(ISGetIndices(ip, &rip)); 1001 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1002 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 
1003 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1004 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 1005 PetscCall(ISRestoreIndices(iip, &irip)); 1006 PetscCall(ISDestroy(&iip)); 1007 PetscCall(ISRestoreIndices(ip, &rip)); 1008 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1009 } 1010 PetscFunctionReturn(PETSC_SUCCESS); 1011 } 1012 1013 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1014 { 1015 PetscFunctionBegin; 1016 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1017 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1018 B->offloadmask = PETSC_OFFLOAD_CPU; 1019 1020 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1021 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1022 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1023 #else 1024 /* determine which version of MatSolve needs to be used. */ 1025 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1026 IS ip = b->row; 1027 PetscBool perm_identity; 1028 1029 PetscCall(ISIdentity(ip, &perm_identity)); 1030 if (perm_identity) { 1031 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1032 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1033 } else { 1034 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1035 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1036 } 1037 #endif 1038 B->ops->matsolve = NULL; 1039 B->ops->matsolvetranspose = NULL; 1040 1041 /* get the triangular factors */ 1042 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1043 PetscFunctionReturn(PETSC_SUCCESS); 1044 } 1045 1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1048 { 1049 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1050 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1051 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1052 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1053 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1054 cusparseIndexBase_t indexBase; 1055 cusparseMatrixType_t matrixType; 1056 cusparseFillMode_t fillMode; 1057 cusparseDiagType_t diagType; 1058 1059 PetscFunctionBegin; 1060 /* allocate space for the transpose of the lower triangular factor */ 1061 PetscCall(PetscNew(&loTriFactorT)); 1062 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1063 1064 /* set the matrix descriptors of the lower triangular factor */ 1065 matrixType = cusparseGetMatType(loTriFactor->descr); 1066 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1067 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1068 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1069 1070 /* Create the matrix description */ 1071 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1072 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1073 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1074 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1075 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1076 1077 /* set the operation */ 1078 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1079 1080 /* allocate GPU space for the CSC of the lower triangular factor*/ 1081 loTriFactorT->csrMat = new CsrMatrix; 1082 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1083 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1084 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1085 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1086 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1087 
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1088 1089 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1090 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1091 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1092 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1094 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1095 #endif 1096 1097 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1098 { 1099 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1100 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1101 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1102 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1103 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1104 #else 1105 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1106 #endif 1107 PetscCallCUSPARSE(stat); 1108 } 1109 1110 PetscCallCUDA(WaitForCUDA()); 1111 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1112 1113 /* Create the solve analysis information */ 1114 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1115 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1116 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1117 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1118 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1119 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1120 #endif 1121 1122 /* perform the solve analysis */ 1123 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1124 
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1125 1126 PetscCallCUDA(WaitForCUDA()); 1127 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1128 1129 /* assign the pointer */ 1130 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1131 1132 /*********************************************/ 1133 /* Now the Transpose of the Upper Tri Factor */ 1134 /*********************************************/ 1135 1136 /* allocate space for the transpose of the upper triangular factor */ 1137 PetscCall(PetscNew(&upTriFactorT)); 1138 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1139 1140 /* set the matrix descriptors of the upper triangular factor */ 1141 matrixType = cusparseGetMatType(upTriFactor->descr); 1142 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1143 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1144 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1145 1146 /* Create the matrix description */ 1147 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1148 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1149 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1150 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1151 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1152 1153 /* set the operation */ 1154 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1155 1156 /* allocate GPU space for the CSC of the upper triangular factor*/ 1157 upTriFactorT->csrMat = new CsrMatrix; 1158 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1159 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1160 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1161 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1162 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1163 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1164 1165 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1166 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1167 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1168 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1169 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1170 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1171 #endif 1172 1173 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1174 { 1175 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1176 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1177 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1179 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1180 #else 1181 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1182 #endif 1183 PetscCallCUSPARSE(stat); 1184 } 1185 1186 PetscCallCUDA(WaitForCUDA()); 1187 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1188 1189 /* Create the solve analysis information */ 1190 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1191 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1192 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1193 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1194 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1195 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1196 #endif 1197 1198 /* perform the solve analysis */ 1199 /* christ, would it have killed you to put this stuff in a function????????? */ 1200 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1201 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1202 1203 PetscCallCUDA(WaitForCUDA()); 1204 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1205 1206 /* assign the pointer */ 1207 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 #endif 1211 1212 struct PetscScalarToPetscInt { 1213 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1214 }; 1215 1216 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1217 { 1218 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1219 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1221 cusparseStatus_t stat; 1222 cusparseIndexBase_t indexBase; 1223 1224 PetscFunctionBegin; 1225 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1226 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1227 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1228 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1229 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1230 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1231 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1232 PetscCall(PetscLogGpuTimeBegin()); 1233 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1234 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1235 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1236 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1237 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1238 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1239 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1240 1241 /* set alpha and beta */ 1242 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 1243 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 1244 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1245 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1246 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1247 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1248 1249 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1250 CsrMatrix *matrixT = new CsrMatrix; 1251 matstructT->mat = matrixT; 1252 matrixT->num_rows = A->cmap->n; 1253 matrixT->num_cols = A->rmap->n; 1254 
matrixT->num_entries = a->nz; 1255 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1256 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1257 matrixT->values = new THRUSTARRAY(a->nz); 1258 1259 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1260 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1261 1262 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1263 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1264 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase, cusparse_scalartype); 1266 PetscCallCUSPARSE(stat); 1267 #else 1268 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1269 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1270 1271 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1272 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1273 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1274 */ 1275 if (matrixT->num_entries) { 1276 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1277 PetscCallCUSPARSE(stat); 1278 1279 } else { 1280 matstructT->matDescr = NULL; 1281 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1282 } 1283 #endif 1284 #endif 1285 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1286 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1287 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1288 #else 1289 CsrMatrix *temp = new CsrMatrix; 1290 CsrMatrix *tempT = new CsrMatrix; 1291 /* First convert HYB to CSR */ 1292 temp->num_rows = A->rmap->n; 1293 temp->num_cols = A->cmap->n; 1294 temp->num_entries = a->nz; 1295 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1296 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1297 temp->values = new THRUSTARRAY(a->nz); 1298 1299 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1300 PetscCallCUSPARSE(stat); 1301 1302 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1303 tempT->num_rows = A->rmap->n; 1304 tempT->num_cols = A->cmap->n; 1305 tempT->num_entries = a->nz; 1306 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1307 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1308 tempT->values = new THRUSTARRAY(a->nz); 1309 1310 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1311 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1312 PetscCallCUSPARSE(stat); 1313 1314 /* Last, convert CSC to HYB */ 1315 cusparseHybMat_t hybMat; 1316 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1317 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1318 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1319 PetscCallCUSPARSE(stat); 1320 1321 /* assign the pointer */ 1322 matstructT->mat = hybMat; 1323 A->transupdated = PETSC_TRUE; 1324 /* delete temporaries */ 1325 if (tempT) { 1326 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1327 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1328 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1329 delete (CsrMatrix *)tempT; 1330 } 1331 if (temp) { 1332 if (temp->values) delete (THRUSTARRAY *)temp->values; 1333 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1334 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1335 delete (CsrMatrix *)temp; 1336 } 1337 #endif 1338 } 1339 } 1340 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, 
update data */ 1341 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1342 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1343 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1344 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1345 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1346 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1347 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1348 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1349 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1350 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1351 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1352 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1353 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1354 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1355 } 1356 if (!cusparsestruct->csr2csc_i) { 1357 THRUSTARRAY csr2csc_a(matrix->num_entries); 1358 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1359 1360 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1361 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1362 void *csr2cscBuffer; 1363 size_t csr2cscBufferSize; 1364 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1365 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, 
&csr2cscBufferSize); 1366 PetscCallCUSPARSE(stat); 1367 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1368 #endif 1369 1370 if (matrix->num_entries) { 1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1373 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1374 1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1376 should be filled with indexBase. So I just take a shortcut here. 1377 */ 1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1379 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1380 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1381 PetscCallCUSPARSE(stat); 1382 #else 1383 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1384 PetscCallCUSPARSE(stat); 1385 #endif 1386 } else { 1387 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1388 } 1389 1390 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1391 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1392 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1393 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1394 #endif 1395 } 1396 PetscCallThrust( 1397 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1398 } 1399 PetscCall(PetscLogGpuTimeEnd()); 1400 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1401 /* the compressed row indices is not used for matTranspose */ 1402 matstructT->cprowIndices = NULL; 1403 /* assign the pointer */ 1404 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1405 A->transupdated = PETSC_TRUE; 1406 PetscFunctionReturn(PETSC_SUCCESS); 1407 } 1408 1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1410 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1411 { 1412 const PetscScalar *barray; 1413 PetscScalar *xarray; 1414 thrust::device_ptr<const PetscScalar> bGPU; 1415 thrust::device_ptr<PetscScalar> xGPU; 1416 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1417 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1418 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1419 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1420 PetscInt m = A->rmap->n; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscLogGpuTimeBegin()); 1424 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1425 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1426 xGPU = thrust::device_pointer_cast(xarray); 1427 bGPU = thrust::device_pointer_cast(barray); 1428 1429 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1430 if (fs->rpermIndices) { 1431 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1432 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1433 } else { 1434 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1435 } 1436 1437 // Solve L Y = X 1438 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1439 // Note 
that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! 1440 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1441 1442 // Solve U X = Y 1443 if (fs->cpermIndices) { 1444 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1445 } else { 1446 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1447 } 1448 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1449 1450 // Reorder X with the column permutation if needed, and put the result back to x 1451 if (fs->cpermIndices) { 1452 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1453 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1454 } 1455 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1456 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1457 PetscCall(PetscLogGpuTimeEnd()); 1458 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1459 PetscFunctionReturn(PETSC_SUCCESS); 1460 } 1461 1462 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1463 { 1464 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1465 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1466 const PetscScalar *barray; 1467 PetscScalar *xarray; 1468 thrust::device_ptr<const PetscScalar> bGPU; 1469 thrust::device_ptr<PetscScalar> xGPU; 1470 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1471 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1472 PetscInt m = A->rmap->n; 1473 1474 PetscFunctionBegin; 1475 PetscCall(PetscLogGpuTimeBegin()); 1476 if 
(!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1477 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1478 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */ 1479 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1480 1481 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1482 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1483 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1484 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1485 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1486 } 1487 1488 if (!fs->updatedTransposeSpSVAnalysis) { 1489 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1490 1491 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1492 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1493 } 1494 1495 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1496 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1497 xGPU = thrust::device_pointer_cast(xarray); 1498 bGPU = thrust::device_pointer_cast(barray); 1499 1500 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1501 if (fs->rpermIndices) { 1502 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), 
thrust::device_pointer_cast(fs->X))); 1503 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1504 } else { 1505 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1506 } 1507 1508 // Solve Ut Y = X 1509 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1510 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1511 1512 // Solve Lt X = Y 1513 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1514 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1515 } else { 1516 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1517 } 1518 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1519 1520 // Reorder X with the column permutation if needed, and put the result back to x 1521 if (fs->cpermIndices) { 1522 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1523 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1524 } 1525 1526 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1527 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1528 PetscCall(PetscLogGpuTimeEnd()); 1529 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532 #else 1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
*/ 1534 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1535 { 1536 PetscInt n = xx->map->n; 1537 const PetscScalar *barray; 1538 PetscScalar *xarray; 1539 thrust::device_ptr<const PetscScalar> bGPU; 1540 thrust::device_ptr<PetscScalar> xGPU; 1541 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1542 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1543 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1544 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1545 1546 PetscFunctionBegin; 1547 /* Analyze the matrix and create the transpose ... on the fly */ 1548 if (!loTriFactorT && !upTriFactorT) { 1549 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1550 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1551 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1552 } 1553 1554 /* Get the GPU pointers */ 1555 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1556 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1557 xGPU = thrust::device_pointer_cast(xarray); 1558 bGPU = thrust::device_pointer_cast(barray); 1559 1560 PetscCall(PetscLogGpuTimeBegin()); 1561 /* First, reorder with the row permutation */ 1562 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1563 1564 /* First, solve U */ 1565 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1566 
upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1567 1568 /* Then, solve L */ 1569 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1570 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1571 1572 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1573 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1574 1575 /* Copy the temporary to the full solution. 
*/ 1576 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1577 1578 /* restore */ 1579 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1580 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1581 PetscCall(PetscLogGpuTimeEnd()); 1582 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1587 { 1588 const PetscScalar *barray; 1589 PetscScalar *xarray; 1590 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1591 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1593 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1594 1595 PetscFunctionBegin; 1596 /* Analyze the matrix and create the transpose ... 
on the fly */ 1597 if (!loTriFactorT && !upTriFactorT) { 1598 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1599 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1600 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1601 } 1602 1603 /* Get the GPU pointers */ 1604 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1605 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1606 1607 PetscCall(PetscLogGpuTimeBegin()); 1608 /* First, solve U */ 1609 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1610 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1611 1612 /* Then, solve L */ 1613 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1614 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1615 1616 /* restore */ 1617 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1618 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1619 PetscCall(PetscLogGpuTimeEnd()); 1620 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1625 { 1626 const PetscScalar *barray; 1627 PetscScalar *xarray; 1628 thrust::device_ptr<const PetscScalar> bGPU; 1629 
thrust::device_ptr<PetscScalar> xGPU; 1630 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1631 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1632 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1633 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1634 1635 PetscFunctionBegin; 1636 /* Get the GPU pointers */ 1637 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1638 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1639 xGPU = thrust::device_pointer_cast(xarray); 1640 bGPU = thrust::device_pointer_cast(barray); 1641 1642 PetscCall(PetscLogGpuTimeBegin()); 1643 /* First, reorder with the row permutation */ 1644 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1645 1646 /* Next, solve L */ 1647 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1648 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1649 1650 /* Then, solve U */ 1651 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1652 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), 
upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1653 1654 /* Last, reorder with the column permutation */ 1655 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1656 1657 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1658 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1659 PetscCall(PetscLogGpuTimeEnd()); 1660 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1665 { 1666 const PetscScalar *barray; 1667 PetscScalar *xarray; 1668 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1669 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1670 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1671 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1672 1673 PetscFunctionBegin; 1674 /* Get the GPU pointers */ 1675 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1676 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1677 1678 PetscCall(PetscLogGpuTimeBegin()); 1679 /* First, solve L */ 1680 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1681 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1682 1683 /* Next, solve U */ 1684 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1685 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1686 1687 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1688 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1689 PetscCall(PetscLogGpuTimeEnd()); 1690 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1691 PetscFunctionReturn(PETSC_SUCCESS); 1692 } 1693 #endif 1694 1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1696 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1697 { 1698 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1699 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1700 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1701 CsrMatrix *Acsr; 1702 PetscInt m, nz; 1703 PetscBool flg; 1704 1705 PetscFunctionBegin; 1706 if (PetscDefined(USE_DEBUG)) { 1707 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1708 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1709 } 1710 1711 /* Copy A's value to fact */ 1712 m = fact->rmap->n; 1713 nz = aij->nz; 1714 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1715 Acsr = (CsrMatrix *)Acusp->mat->mat; 1716 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1717 1718 /* Factorize fact inplace */ 1719 if (m) 1720 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1721 fs->matDescr_M, fs->csrVal, 
fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1722 if (PetscDefined(USE_DEBUG)) { 1723 int numerical_zero; 1724 cusparseStatus_t status; 1725 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1726 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1727 } 1728 1729 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1730 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1731 */ 1732 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1733 1734 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1735 1736 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1737 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1738 1739 fact->offloadmask = PETSC_OFFLOAD_GPU; 1740 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 
1741 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1742 fact->ops->matsolve = NULL; 1743 fact->ops->matsolvetranspose = NULL; 1744 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1745 PetscFunctionReturn(PETSC_SUCCESS); 1746 } 1747 1748 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1749 { 1750 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1751 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1752 PetscInt m, nz; 1753 1754 PetscFunctionBegin; 1755 if (PetscDefined(USE_DEBUG)) { 1756 PetscInt i; 1757 PetscBool flg, missing; 1758 1759 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1760 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1761 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1762 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1763 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1764 } 1765 1766 /* Free the old stale stuff */ 1767 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1768 1769 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1770 but they will not be used. Allocate them just for easy debugging. 
1771 */ 1772 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1773 1774 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1775 fact->factortype = MAT_FACTOR_ILU; 1776 fact->info.factor_mallocs = 0; 1777 fact->info.fill_ratio_given = info->fill; 1778 fact->info.fill_ratio_needed = 1.0; 1779 1780 aij->row = NULL; 1781 aij->col = NULL; 1782 1783 /* ====================================================================== */ 1784 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1785 /* We'll do in-place factorization on fact */ 1786 /* ====================================================================== */ 1787 const int *Ai, *Aj; 1788 1789 m = fact->rmap->n; 1790 nz = aij->nz; 1791 1792 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1))); 1793 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz)); 1794 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz)); 1795 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1796 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1797 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1798 1799 /* ====================================================================== */ 1800 /* Create descriptors for M, L, U */ 1801 /* ====================================================================== */ 1802 cusparseFillMode_t fillMode; 1803 cusparseDiagType_t diagType; 1804 1805 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1806 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1807 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1808 1809 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1810 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1811 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1812 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1813 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1814 */ 1815 fillMode = CUSPARSE_FILL_MODE_LOWER; 1816 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1817 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1818 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1819 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1820 1821 fillMode = CUSPARSE_FILL_MODE_UPPER; 1822 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1823 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1824 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1825 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1826 1827 /* ========================================================================= */ 1828 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1829 /* ========================================================================= */ 1830 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1831 if (m) 1832 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1833 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1834 1835 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1836 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1837 1838 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1839 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1840 1841 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1842 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1843 1844 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1845 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1846 1847 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1848 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1849 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1850 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1851 */ 1852 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1853 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1854 fs->spsvBuffer_L = fs->factBuffer_M; 1855 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1856 } else { 1857 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1858 fs->spsvBuffer_U = fs->factBuffer_M; 1859 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1860 } 1861 1862 /* ========================================================================== */ 1863 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1864 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1865 /* ========================================================================== */ 1866 int structural_zero; 1867 cusparseStatus_t status; 1868 1869 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1870 if (m) 1871 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1872 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1873 if (PetscDefined(USE_DEBUG)) { 1874 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1875 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1876 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1877 } 1878 1879 /* Estimate FLOPs of the numeric factorization */ 1880 { 1881 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1882 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1883 PetscLogDouble flops = 0.0; 1884 1885 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1886 Ai = Aseq->i; 1887 Adiag = Aseq->diag; 1888 for (PetscInt i = 0; i < m; i++) { 1889 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1890 nzRow = Ai[i + 1] - Ai[i]; 1891 nzLeft = Adiag[i] - Ai[i]; 1892 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1893 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1894 */ 1895 nzLeft = (nzRow - 1) / 2; 1896 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1897 } 1898 } 1899 fs->numericFactFlops = flops; 1900 } 1901 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1902 PetscFunctionReturn(PETSC_SUCCESS); 1903 } 1904 1905 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1906 { 1907 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1908 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1909 const PetscScalar *barray; 1910 PetscScalar *xarray; 1911 1912 PetscFunctionBegin; 1913 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1914 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1915 PetscCall(PetscLogGpuTimeBegin()); 1916 1917 /* Solve L*y = b */ 1918 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1919 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1920 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1921 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1922 1923 /* Solve Lt*x = y */ 1924 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1925 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1926 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1927 1928 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1929 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1930 1931 PetscCall(PetscLogGpuTimeEnd()); 1932 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1937 { 1938 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1939 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1940 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1941 CsrMatrix *Acsr; 1942 PetscInt m, nz; 1943 PetscBool flg; 1944 1945 PetscFunctionBegin; 1946 if (PetscDefined(USE_DEBUG)) { 1947 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1948 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1949 } 1950 1951 /* Copy A's value to fact */ 1952 m = fact->rmap->n; 1953 nz = aij->nz; 1954 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1955 Acsr = (CsrMatrix *)Acusp->mat->mat; 1956 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1957 1958 /* Factorize fact inplace */ 1959 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1960 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1961 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1962 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1963 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1964 */ 1965 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1966 if (PetscDefined(USE_DEBUG)) { 1967 int numerical_zero; 1968 cusparseStatus_t status; 1969 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1970 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1971 } 1972 1973 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1974 1975 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1976 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1977 */ 1978 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1979 1980 fact->offloadmask = PETSC_OFFLOAD_GPU; 1981 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1982 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1983 fact->ops->matsolve = NULL; 1984 fact->ops->matsolvetranspose = NULL; 1985 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1986 PetscFunctionReturn(PETSC_SUCCESS); 1987 } 
1988 1989 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1990 { 1991 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1992 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1993 PetscInt m, nz; 1994 1995 PetscFunctionBegin; 1996 if (PetscDefined(USE_DEBUG)) { 1997 PetscInt i; 1998 PetscBool flg, missing; 1999 2000 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2001 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2002 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2003 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2004 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2005 } 2006 2007 /* Free the old stale stuff */ 2008 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2009 2010 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2011 but they will not be used. Allocate them just for easy debugging. 2012 */ 2013 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2014 2015 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2016 fact->factortype = MAT_FACTOR_ICC; 2017 fact->info.factor_mallocs = 0; 2018 fact->info.fill_ratio_given = info->fill; 2019 fact->info.fill_ratio_needed = 1.0; 2020 2021 aij->row = NULL; 2022 aij->col = NULL; 2023 2024 /* ====================================================================== */ 2025 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2026 /* We'll do in-place factorization on fact */ 2027 /* ====================================================================== */ 2028 const int *Ai, *Aj; 2029 2030 m = fact->rmap->n; 2031 nz = aij->nz; 2032 2033 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1))); 2034 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz)); 2035 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2036 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2037 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2038 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2039 2040 /* ====================================================================== */ 2041 /* Create mat descriptors for M, L */ 2042 /* ====================================================================== */ 2043 cusparseFillMode_t fillMode; 2044 cusparseDiagType_t diagType; 2045 2046 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2047 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2048 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2049 2050 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2051 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2052 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2053 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2054 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
2055 */ 2056 fillMode = CUSPARSE_FILL_MODE_LOWER; 2057 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2058 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2059 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2060 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2061 2062 /* ========================================================================= */ 2063 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2064 /* ========================================================================= */ 2065 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2066 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2067 2068 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2069 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2070 2071 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2072 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2073 2074 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2075 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2076 2077 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2078 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2079 2080 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2081 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2082 */ 2083 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2084 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2085 fs->spsvBuffer_L = fs->factBuffer_M; 2086 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2087 } else { 2088 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2089 fs->spsvBuffer_Lt = fs->factBuffer_M; 2090 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2091 } 2092 2093 /* ========================================================================== */ 2094 /* Perform analysis of ic0 on M */ 2095 /* The lower triangular part of M has the same sparsity pattern as L */ 2096 /* ========================================================================== */ 2097 int structural_zero; 2098 cusparseStatus_t status; 2099 2100 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2101 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2102 if (PetscDefined(USE_DEBUG)) { 2103 /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2104 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2105 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2106 } 2107 2108 /* Estimate FLOPs of the numeric factorization */ 2109 { 2110 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2111 PetscInt *Ai, nzRow, nzLeft; 2112 PetscLogDouble flops = 0.0; 2113 2114 Ai = Aseq->i; 2115 for (PetscInt i = 0; i < m; i++) { 2116 nzRow = Ai[i + 1] - Ai[i]; 2117 if (nzRow > 1) { 2118 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2119 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2120 */ 2121 nzLeft = (nzRow - 1) / 2; 2122 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2123 } 2124 } 2125 fs->numericFactFlops = flops; 2126 } 2127 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2128 PetscFunctionReturn(PETSC_SUCCESS); 2129 } 2130 #endif 2131 2132 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2133 { 2134 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2135 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2136 2137 PetscFunctionBegin; 2138 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2139 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2140 B->offloadmask = PETSC_OFFLOAD_CPU; 2141 2142 if (!cusparsestruct->use_cpu_solve) { 2143 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2144 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2145 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2146 #else 2147 /* determine which version of MatSolve needs to be used. 
*/ 2148 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2149 IS isrow = b->row, iscol = b->col; 2150 PetscBool row_identity, col_identity; 2151 2152 PetscCall(ISIdentity(isrow, &row_identity)); 2153 PetscCall(ISIdentity(iscol, &col_identity)); 2154 if (row_identity && col_identity) { 2155 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2156 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2157 } else { 2158 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2159 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2160 } 2161 #endif 2162 } 2163 B->ops->matsolve = NULL; 2164 B->ops->matsolvetranspose = NULL; 2165 2166 /* get the triangular factors */ 2167 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2168 PetscFunctionReturn(PETSC_SUCCESS); 2169 } 2170 2171 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2172 { 2173 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2174 2175 PetscFunctionBegin; 2176 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2177 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2178 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2179 PetscFunctionReturn(PETSC_SUCCESS); 2180 } 2181 2182 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2183 { 2184 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2185 2186 PetscFunctionBegin; 2187 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2188 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2189 if (cusparseTriFactors->factorizeOnDevice) { 2190 PetscCall(ISIdentity(isrow, &row_identity)); 2191 PetscCall(ISIdentity(iscol, &col_identity)); 2192 } 2193 if (!info->levels && row_identity && col_identity) { 2194 
PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2195 } else 2196 #endif 2197 { 2198 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2199 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2200 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2201 } 2202 PetscFunctionReturn(PETSC_SUCCESS); 2203 } 2204 2205 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2206 { 2207 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2208 2209 PetscFunctionBegin; 2210 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2211 PetscBool perm_identity = PETSC_FALSE; 2212 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 2213 if (!info->levels && perm_identity) { 2214 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2215 } else 2216 #endif 2217 { 2218 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2219 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2220 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2221 } 2222 PetscFunctionReturn(PETSC_SUCCESS); 2223 } 2224 2225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2226 { 2227 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2228 2229 PetscFunctionBegin; 2230 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2231 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2232 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2233 PetscFunctionReturn(PETSC_SUCCESS); 2234 } 2235 2236 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2237 { 2238 PetscFunctionBegin; 2239 *type = MATSOLVERCUSPARSE; 2240 PetscFunctionReturn(PETSC_SUCCESS); 2241 } 2242 2243 /*MC 2244 MATSOLVERCUSPARSE = 
"cusparse" - A matrix type providing triangular solvers for seq matrices 2245 on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2246 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2247 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2248 CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2249 algorithms are not recommended. This class does NOT support direct solver operations. 2250 2251 Level: beginner 2252 2253 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 2254 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2255 M*/ 2256 2257 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2258 { 2259 PetscInt n = A->rmap->n; 2260 PetscBool factOnDevice, factOnHost; 2261 char *prefix; 2262 char factPlace[32] = "device"; /* the default */ 2263 2264 PetscFunctionBegin; 2265 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2266 PetscCall(MatSetSizes(*B, n, n, n, n)); 2267 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2268 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2269 2270 prefix = (*B)->factorprefix ? 
(*B)->factorprefix : ((PetscObject)A)->prefix; 2271 PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat"); 2272 PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2273 PetscOptionsEnd(); 2274 PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2275 PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2276 PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2277 ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2278 2279 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2280 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2281 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2282 if (!A->boundtocpu) { 2283 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2284 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2285 } else { 2286 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2287 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2288 } 2289 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2290 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2291 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2292 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2293 if (!A->boundtocpu) { 2294 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2295 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2296 } else { 2297 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2298 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the numerical values of A from the device into the host
  array a->a when the offload mask says only the device copy is current.

  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_GPU. For an unfactored matrix the values
  come from the CsrMatrix stored in the mult struct; with CUDA >= 11.4 a factored matrix is copied
  from fs->csrVal when that pointer is set. Any other factored matrix raises PETSC_ERR_SUP.
  On success the mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* the same spptr is reinterpreted as the triangular-factor container for factored matrices */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  /* host values are already usable: nothing to transfer */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *devcsr = (CsrMatrix *)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(aij->a, devcsr->values->data().get(), aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  else if (fs->csrVal) {
    /* We have a factorized matrix on device and are able to copy it to host */
    PetscCallCUDA(cudaMemcpy(aij->a, fs->csrVal, aij->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
#endif
  else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
  PetscCall(PetscLogGpuToCpu(aij->nz * sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-write host access: refresh the host values from the device, then hand out a->a. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-write host access: the host copy is now the only valid one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only host access: refresh the host values from the device, then hand out a->a. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-only host access: only the caller's pointer is cleared, the offload mask is untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only host access: no device-to-host copy is made before handing out a->a. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of write-only host access: the host copy is now the only valid one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device copy of A current.

  Work is only done when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  - Same nonzero state and CSR format: only the values are re-assigned on the device.
  - Otherwise: the device mult struct, work vector and row offsets are destroyed and rebuilt
    from the host CSR data (or from the compressed-row data when a->compressedrow.use is set).

  The offload mask becomes PETSC_OFFLOAD_BOTH unless the host has no value array (a->a == NULL),
  in which case only the structure was uploaded and the mask is left alone.
  Errors with PETSC_ERR_GPU if A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n;
  PetscInt                     *ii, *ridx;
  PetscInt                      ncprow; /* number of compressed-row indices uploaded (0 when not compressed) */
  cusparseStatus_t              stat;
  PetscBool                     markBoth = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  /* device copy already valid */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(PETSC_SUCCESS);

  if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
    CsrMatrix *devcsr = (CsrMatrix *)cusparsestruct->mat->mat;

    PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    devcsr->values->assign(a->a, a->a + a->nz);
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    /* values changed: flag the cached transpose as stale without destroying it */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  } else {
    PetscInt nnz;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    /* throw away everything that depended on the old nonzero structure */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    delete cusparsestruct->workVector;
    delete cusparsestruct->rowoffsets_gpu;
    cusparsestruct->workVector     = NULL;
    cusparsestruct->rowoffsets_gpu = NULL;
    try {
      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        m    = A->rmap->n;
        ii   = a->i;
        ridx = NULL;
      }
      PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
      if (a->a) {
        nnz = a->nz;
      } else {
        /* no host values: take the count from the row offsets and do not claim both copies are valid */
        nnz      = ii[m];
        markBoth = PETSC_FALSE;
      }
      PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

      /* create cusparse matrix */
      cusparsestruct->nrows = m;
      matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
      PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
      PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

      /* device-resident scalars 1, 0, 1 used as alpha/beta with CUSPARSE_POINTER_MODE_DEVICE */
      PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = m;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;
        csr->row_offsets = new THRUSTINTARRAY32(m + 1);
        csr->row_offsets->assign(ii, ii + m + 1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(a->j, a->j + nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (a->a) csr->values->assign(a->a, a->a + nnz);

        /* assign the pointer */
        matstruct->mat = csr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        if (csr->num_rows) { /* cusparse errors on empty matrices! */
          stat = cusparseCreateCsr(&matstruct->matDescr, csr->num_rows, csr->num_cols, csr->num_entries, csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                   CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
          PetscCallCUSPARSE(stat);
        }
#endif
      } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        /* stage the matrix as CSR on the device, convert it to HYB/ELL, then drop the staging copy */
        CsrMatrix *mat = new CsrMatrix;

        mat->num_rows    = m;
        mat->num_cols    = A->cmap->n;
        mat->num_entries = nnz;
        mat->row_offsets = new THRUSTINTARRAY32(m + 1);
        mat->row_offsets->assign(ii, ii + m + 1);

        mat->column_indices = new THRUSTINTARRAY32(nnz);
        mat->column_indices->assign(a->j, a->j + nnz);

        mat->values = new THRUSTARRAY(nnz);
        if (a->a) mat->values->assign(a->a, a->a + nnz);

        cusparseHybMat_t hybMat;
        PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
        cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
        PetscCallCUSPARSE(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (mat) {
          if (mat->values) delete (THRUSTARRAY *)mat->values;
          if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
          if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
          delete (CsrMatrix *)mat;
        }
#endif
      }

      /* assign the compressed row indices */
      if (a->compressedrow.use) {
        cusparsestruct->workVector = new THRUSTARRAY(m);
        matstruct->cprowIndices    = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx, ridx + m);
        ncprow = m;
      } else {
        cusparsestruct->workVector = NULL;
        matstruct->cprowIndices    = NULL;
        ncprow                     = 0;
      }
      PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + ncprow * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

      /* assign the pointer */
      cusparsestruct->mat = matstruct;
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    cusparsestruct->nonzerostate = A->nonzerostate;
  }
  if (markBoth) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Tuple functor: element 1 of the tuple is incremented by element 0 (get<1> = get<1> + get<0>). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
{
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Tuple functor: element 0 of the tuple is overwritten with element 1 (get<0> = get<1>). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/*
  Per-product context stored in C->product->data by the symbolic routines of this file and
  released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool      cisdense; /* C was MATSEQDENSE on entry to the symbolic phase; the numeric phase converts it back */
  PetscScalar   *Bt;       /* device buffer holding B^T, only allocated with CUDA < 11 for ABt/RARt */
  Mat            X;        /* intermediate dense matrix for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count logged by the sparse-sparse numeric phase */
  CsrMatrix     *Bcsr;     /* alternative CSR view of B, used instead of B's own CSR when set */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor paired with Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/*
  MatDestroy_MatMatCusparse - Destructor installed as C->product->destroy.

  Frees every device buffer and cuSPARSE descriptor recorded in the MatMatCusparse context,
  destroys the intermediate matrix X and finally frees the context itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors are created lazily, so only destroy the ones that exist */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of a product between a
  MATSEQAIJCUSPARSE matrix A and a dense matrix B.

  Supported product types: AB, AtB, ABt, PtAP, RARt. For PtAP and RARt the sparse-dense product is
  written into the intermediate matrix mmdata->X, and a dense-dense product with B then fills C.
  If B is not MATSEQDENSECUDA it is converted in place for the duration of the call and converted
  back to MATSEQDENSE afterwards; C is converted back to MATSEQDENSE when mmdata->cisdense is set.

  With CUDA >= 11 the dense descriptors and the SpMM work buffer are cached in the product data and
  rebuilt when a leading dimension changes. Descriptors that survive such a rebuild get their
  value pointers refreshed, exactly as is done on calls where nothing is rebuilt.

  Errors if the product data is missing, A has the wrong type or is bound to the CPU, or the
  product type is unsupported.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select the sparse operand, the operation applied to it, and the size (m x n) of the sparse-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* the sparse-dense product lands in X for the triple products, in C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;

    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    } else {
      /* descriptor kept from an earlier call (only the other LDA changed): refresh its data pointer */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    } else {
      /* descriptor kept from an earlier call (only the other LDA changed): refresh its data pointer */
      PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    } else if (mmdata->initialized) {
      /* rebuild triggered by an LDA change: keep the sparse descriptor in step like the steady-state path does */
      PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow-only work buffer: reallocate when missing or too small */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary conversions to the CUDA dense type */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase of a product between a
  MATSEQAIJCUSPARSE matrix A (CSR format only) and a dense matrix B.

  Sets the sizes of C, switches C to MATSEQDENSECUDA (remembering whether it was MATSEQDENSE),
  creates the MatMatCusparse product data together with the intermediate matrix X needed for
  PtAP/RARt, and installs the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions (m x n) of the final product C */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of a sparse-sparse product
  (AB, AtB or ABt) between two MATSEQAIJCUSPARSE matrices, all in CSR format.

  If mmdata->reusesym is set, the values were already computed during the symbolic phase and only
  the assembly bookkeeping at the end is performed. The same shortcut is taken when C has no
  nonzeros. Otherwise A and B are brought up to date on the device and C's values are recomputed
  with the SpGEMM descriptor prepared by the symbolic phase; C is then marked as valid on the
  device only.

  AtB with symmetric A and ABt with symmetric B are computed as AB; this requires that the
  symbolic phase made the same choice. For the remaining AtB/ABt cases the explicit transpose is
  refreshed before use, because MatSeqAIJCUSPARSECopyToGPU() invalidates it whenever it uploads
  new values.

  Errors on wrong matrix types or formats, matrices bound to the CPU, missing device structures,
  or an unsupported product type.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transposed symmetric operand is treated as the operand itself, provided symbolic did the same */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    /* the CopyToGPU above may have invalidated the cached transpose: bring it up to date before reading it */
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    /* the CopyToGPU above may have invalidated the cached transpose: bring it up to date before reading it */
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  /* the fresh values exist on the device only */
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3032 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3033 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3034 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3035 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3036 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3037 3038 ptype = product->type; 3039 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3040 ptype = MATPRODUCT_AB; 3041 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3042 } 3043 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3044 ptype = MATPRODUCT_AB; 3045 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3046 } 3047 biscompressed = PETSC_FALSE; 3048 ciscompressed = PETSC_FALSE; 3049 switch (ptype) { 3050 case MATPRODUCT_AB: 3051 m = A->rmap->n; 3052 n = B->cmap->n; 3053 k = A->cmap->n; 3054 Amat = Acusp->mat; 3055 Bmat = Bcusp->mat; 3056 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3057 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3058 break; 3059 case MATPRODUCT_AtB: 3060 m = A->cmap->n; 3061 n = B->cmap->n; 3062 k = A->rmap->n; 3063 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3064 Amat = Acusp->matTranspose; 3065 Bmat = Bcusp->mat; 3066 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3067 break; 3068 case MATPRODUCT_ABt: 3069 m = A->rmap->n; 3070 n = B->rmap->n; 3071 k = A->cmap->n; 3072 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3073 Amat = Acusp->mat; 3074 Bmat = Bcusp->matTranspose; 3075 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3076 break; 3077 default: 3078 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3079 } 3080 3081 /* create cusparse matrix */ 3082 
PetscCall(MatSetSizes(C, m, n, m, n)); 3083 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3084 c = (Mat_SeqAIJ *)C->data; 3085 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3086 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3087 Ccsr = new CsrMatrix; 3088 3089 c->compressedrow.use = ciscompressed; 3090 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3091 c->compressedrow.nrows = a->compressedrow.nrows; 3092 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3093 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3094 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3095 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3096 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3097 } else { 3098 c->compressedrow.nrows = 0; 3099 c->compressedrow.i = NULL; 3100 c->compressedrow.rindex = NULL; 3101 Ccusp->workVector = NULL; 3102 Cmat->cprowIndices = NULL; 3103 } 3104 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 3105 Ccusp->mat = Cmat; 3106 Ccusp->mat->mat = Ccsr; 3107 Ccsr->num_rows = Ccusp->nrows; 3108 Ccsr->num_cols = n; 3109 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3110 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3111 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3112 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3113 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 3114 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 3115 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 3116 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3117 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3118 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3119 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3120 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3121 c->nz = 0; 3122 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3123 Ccsr->values = new THRUSTARRAY(c->nz); 3124 goto finalizesym; 3125 } 3126 3127 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3128 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3129 Acsr = (CsrMatrix *)Amat->mat; 3130 if (!biscompressed) { 3131 Bcsr = (CsrMatrix *)Bmat->mat; 3132 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3133 BmatSpDescr = Bmat->matDescr; 3134 #endif 3135 } else { /* we need to use row offsets for the full matrix */ 3136 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3137 Bcsr = new CsrMatrix; 3138 Bcsr->num_rows = B->rmap->n; 3139 Bcsr->num_cols = cBcsr->num_cols; 3140 Bcsr->num_entries = cBcsr->num_entries; 3141 Bcsr->column_indices = cBcsr->column_indices; 3142 Bcsr->values = cBcsr->values; 3143 if (!Bcusp->rowoffsets_gpu) { 3144 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3145 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3146 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3147 } 3148 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3149 mmdata->Bcsr = Bcsr; 3150 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3151 if (Bcsr->num_rows && Bcsr->num_cols) { 3152 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3153 PetscCallCUSPARSE(stat); 3154 } 3155 BmatSpDescr = mmdata->matSpBDescr; 3156 #endif 3157 } 3158 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3159 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3160 /* precompute flops count */ 3161 if (ptype == MATPRODUCT_AB) { 3162 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3163 const PetscInt st = a->i[i]; 3164 const PetscInt en = a->i[i + 1]; 3165 for (j = st; j < en; j++) { 3166 const PetscInt brow = a->j[j]; 3167 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3168 } 3169 } 3170 } else if (ptype == MATPRODUCT_AtB) { 3171 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3172 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3173 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3174 flops += (2. * anzi) * bnzi; 3175 } 3176 } else { /* TODO */ 3177 flops = 0.; 3178 } 3179 3180 mmdata->flops = flops; 3181 PetscCall(PetscLogGpuTimeBegin()); 3182 3183 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3184 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3185 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3186 PetscCallCUSPARSE(stat); 3187 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3188 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3189 { 3190 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3191 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3192 */ 3193 void *dBuffer1 = NULL; 3194 void *dBuffer2 = NULL; 3195 void *dBuffer3 = NULL; 3196 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3197 size_t bufferSize1 = 0; 3198 size_t bufferSize2 = 0; 3199 size_t bufferSize3 = 0; 3200 size_t bufferSize4 = 0; 3201 size_t bufferSize5 = 0; 3202 3203 /* ask bufferSize1 bytes for external memory */ 3204 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3205 PetscCallCUSPARSE(stat); 3206 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3207 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3208 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3209 PetscCallCUSPARSE(stat); 3210 3211 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3212 PetscCallCUSPARSE(stat); 3213 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3214 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3215 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3216 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3217 PetscCallCUSPARSE(stat); 3218 PetscCallCUDA(cudaFree(dBuffer1)); 3219 PetscCallCUDA(cudaFree(dBuffer2)); 3220 3221 /* get matrix C non-zero entries C_nnz1 */ 3222 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3223 c->nz = (PetscInt)C_nnz1; 3224 /* allocate matrix C */ 3225 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3226 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3227 Ccsr->values = new THRUSTARRAY(c->nz); 3228 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3229 /* update matC with the new pointers */ 3230 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3231 PetscCallCUSPARSE(stat); 3232 3233 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3234 PetscCallCUSPARSE(stat); 3235 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3236 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3237 PetscCallCUSPARSE(stat); 3238 PetscCallCUDA(cudaFree(dBuffer3)); 3239 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3240 PetscCallCUSPARSE(stat); 3241 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3242 } 3243 #else 3244 size_t bufSize2; 3245 /* ask bufferSize bytes for external memory */ 3246 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3247 PetscCallCUSPARSE(stat); 3248 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3249 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3250 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3251 PetscCallCUSPARSE(stat); 3252 /* ask bufferSize again bytes for external memory */ 3253 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3254 PetscCallCUSPARSE(stat); 3255 /* The CUSPARSE documentation is not clear, nor the API 3256 We need both buffers to perform the operations properly! 3257 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3258 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3259 is stored in the descriptor! What a messy API... 
*/ 3260 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3261 /* compute the intermediate product of A * B */ 3262 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3263 PetscCallCUSPARSE(stat); 3264 /* get matrix C non-zero entries C_nnz1 */ 3265 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3266 c->nz = (PetscInt)C_nnz1; 3267 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3268 mmdata->mmBufferSize / 1024)); 3269 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3270 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3271 Ccsr->values = new THRUSTARRAY(c->nz); 3272 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3273 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3274 PetscCallCUSPARSE(stat); 3275 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3276 PetscCallCUSPARSE(stat); 3277 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3278 #else 3279 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3280 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3281 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3282 PetscCallCUSPARSE(stat); 3283 c->nz = cnz; 3284 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3285 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3286 Ccsr->values = new THRUSTARRAY(c->nz); 3287 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3288 3289 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3290 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3291 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3292 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3293 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3294 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3295 PetscCallCUSPARSE(stat); 3296 #endif 3297 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3298 PetscCall(PetscLogGpuTimeEnd()); 3299 finalizesym: 3300 c->singlemalloc = PETSC_FALSE; 3301 c->free_a = PETSC_TRUE; 3302 c->free_ij = PETSC_TRUE; 3303 PetscCall(PetscMalloc1(m + 1, &c->i)); 3304 PetscCall(PetscMalloc1(c->nz, &c->j)); 3305 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3306 PetscInt *d_i = c->i; 3307 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3308 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3309 ii = *Ccsr->row_offsets; 3310 jj = *Ccsr->column_indices; 3311 
if (ciscompressed) d_i = c->compressedrow.i; 3312 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3313 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3314 } else { 3315 PetscInt *d_i = c->i; 3316 if (ciscompressed) d_i = c->compressedrow.i; 3317 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3318 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3319 } 3320 if (ciscompressed) { /* need to expand host row offsets */ 3321 PetscInt r = 0; 3322 c->i[0] = 0; 3323 for (k = 0; k < c->compressedrow.nrows; k++) { 3324 const PetscInt next = c->compressedrow.rindex[k]; 3325 const PetscInt old = c->compressedrow.i[k]; 3326 for (; r < next; r++) c->i[r + 1] = old; 3327 } 3328 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3329 } 3330 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3331 PetscCall(PetscMalloc1(m, &c->ilen)); 3332 PetscCall(PetscMalloc1(m, &c->imax)); 3333 c->maxnz = c->nz; 3334 c->nonzerorowcnt = 0; 3335 c->rmax = 0; 3336 for (k = 0; k < m; k++) { 3337 const PetscInt nn = c->i[k + 1] - c->i[k]; 3338 c->ilen[k] = c->imax[k] = nn; 3339 c->nonzerorowcnt += (PetscInt) !!nn; 3340 c->rmax = PetscMax(c->rmax, nn); 3341 } 3342 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3343 PetscCall(PetscMalloc1(c->nz, &c->a)); 3344 Ccsr->num_entries = c->nz; 3345 3346 C->nonzerostate++; 3347 PetscCall(PetscLayoutSetUp(C->rmap)); 3348 PetscCall(PetscLayoutSetUp(C->cmap)); 3349 Ccusp->nonzerostate = C->nonzerostate; 3350 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3351 C->preallocated = PETSC_TRUE; 3352 C->assembled = PETSC_FALSE; 3353 C->was_assembled = PETSC_FALSE; 3354 if 
(product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3355 mmdata->reusesym = PETSC_TRUE; 3356 C->offloadmask = PETSC_OFFLOAD_GPU; 3357 } 3358 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3359 PetscFunctionReturn(PETSC_SUCCESS); 3360 } 3361 3362 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3363 3364 /* handles sparse or dense B */ 3365 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3366 { 3367 Mat_Product *product = mat->product; 3368 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3369 3370 PetscFunctionBegin; 3371 MatCheckProduct(mat, 1); 3372 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3373 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3374 if (product->type == MATPRODUCT_ABC) { 3375 Ciscusp = PETSC_FALSE; 3376 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3377 } 3378 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3379 PetscBool usecpu = PETSC_FALSE; 3380 switch (product->type) { 3381 case MATPRODUCT_AB: 3382 if (product->api_user) { 3383 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3384 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3385 PetscOptionsEnd(); 3386 } else { 3387 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3388 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3389 PetscOptionsEnd(); 3390 } 3391 break; 3392 case 
MATPRODUCT_AtB: 3393 if (product->api_user) { 3394 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3395 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3396 PetscOptionsEnd(); 3397 } else { 3398 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3399 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3400 PetscOptionsEnd(); 3401 } 3402 break; 3403 case MATPRODUCT_PtAP: 3404 if (product->api_user) { 3405 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3406 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3407 PetscOptionsEnd(); 3408 } else { 3409 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3410 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3411 PetscOptionsEnd(); 3412 } 3413 break; 3414 case MATPRODUCT_RARt: 3415 if (product->api_user) { 3416 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3417 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3418 PetscOptionsEnd(); 3419 } else { 3420 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3421 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3422 PetscOptionsEnd(); 3423 } 3424 break; 3425 case MATPRODUCT_ABC: 3426 if (product->api_user) { 3427 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3428 
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3429 PetscOptionsEnd(); 3430 } else { 3431 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3432 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3433 PetscOptionsEnd(); 3434 } 3435 break; 3436 default: 3437 break; 3438 } 3439 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3440 } 3441 /* dispatch */ 3442 if (isdense) { 3443 switch (product->type) { 3444 case MATPRODUCT_AB: 3445 case MATPRODUCT_AtB: 3446 case MATPRODUCT_ABt: 3447 case MATPRODUCT_PtAP: 3448 case MATPRODUCT_RARt: 3449 if (product->A->boundtocpu) { 3450 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3451 } else { 3452 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3453 } 3454 break; 3455 case MATPRODUCT_ABC: 3456 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3457 break; 3458 default: 3459 break; 3460 } 3461 } else if (Biscusp && Ciscusp) { 3462 switch (product->type) { 3463 case MATPRODUCT_AB: 3464 case MATPRODUCT_AtB: 3465 case MATPRODUCT_ABt: 3466 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3467 break; 3468 case MATPRODUCT_PtAP: 3469 case MATPRODUCT_RARt: 3470 case MATPRODUCT_ABC: 3471 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3472 break; 3473 default: 3474 break; 3475 } 3476 } else { /* fallback for AIJ */ 3477 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3478 } 3479 PetscFunctionReturn(PETSC_SUCCESS); 3480 } 3481 3482 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3483 { 3484 PetscFunctionBegin; 3485 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3486 PetscFunctionReturn(PETSC_SUCCESS); 3487 } 3488 3489 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3490 { 3491 
PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H xx, via the shared kernel with trans and herm both set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H xx + yy, via the shared kernel with trans and herm both set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T xx, via the shared kernel with trans set and herm unset */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one thread per entry.

  Expects a 1D launch with at least n threads in total; surplus threads do nothing.
  The update is not atomic -- presumably the entries of idx are distinct (the caller in this
  file passes compressed-row indices); verify before reusing with repeated indices.

  The global index is computed in PetscInt so that it cannot be truncated when PetscInt is 64-bit
  and n does not fit in an int.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3524 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3525 { 3526 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3527 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3528 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3529 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3530 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3531 PetscBool compressed; 3532 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3533 PetscInt nx, ny; 3534 #endif 3535 3536 PetscFunctionBegin; 3537 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3538 if (!a->nz) { 3539 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3540 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3541 PetscFunctionReturn(PETSC_SUCCESS); 3542 } 3543 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3544 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3545 if (!trans) { 3546 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3547 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3548 } else { 3549 if (herm || !A->form_explicit_transpose) { 3550 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3551 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3552 } else { 3553 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3554 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3555 } 3556 } 3557 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3558 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3559 3560 try { 3561 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3562 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3563 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3564 3565 PetscCall(PetscLogGpuTimeBegin()); 3566 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3567 /* z = A x + beta y. 3568 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3569 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3570 */ 3571 xptr = xarray; 3572 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3573 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3574 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3575 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3576 allocated to accommodate different uses. So we get the length info directly from mat. 3577 */ 3578 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3579 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3580 nx = mat->num_cols; 3581 ny = mat->num_rows; 3582 } 3583 #endif 3584 } else { 3585 /* z = A^T x + beta y 3586 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3587 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3588 */ 3589 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3590 dptr = zarray; 3591 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3592 if (compressed) { /* Scatter x to work vector */ 3593 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3594 3595 thrust::for_each( 3596 #if PetscDefined(HAVE_THRUST_ASYNC) 3597 thrust::cuda::par.on(PetscDefaultCudaStream), 3598 #endif 3599 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3600 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3601 } 3602 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3603 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3604 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3605 nx = mat->num_rows; 3606 ny = mat->num_cols; 3607 } 3608 #endif 3609 } 3610 3611 /* csr_spmv does y = alpha op(A) x + beta y */ 3612 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3613 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3614 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3615 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3616 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3617 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3618 PetscCallCUSPARSE( 3619 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3620 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3621 3622 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3623 } else { 3624 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3625 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3626 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3627 } 3628 3629 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3630 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3631 #else 3632 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3633 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3634 #endif 3635 } else { 3636 if (cusparsestruct->nrows) { 3637 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3638 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3639 #else 3640 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3641 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3642 #endif 3643 } 3644 } 3645 PetscCall(PetscLogGpuTimeEnd()); 3646 3647 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3648 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3649 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3650 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3651 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3652 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3653 } 3654 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3655 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3656 } 3657 3658 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3659 if (compressed) { 3660 PetscCall(PetscLogGpuTimeBegin()); 3661 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3662 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3663 prevent that. So I just add a ScatterAdd kernel. 3664 */ 3665 #if 0 3666 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3667 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3668 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3669 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3670 VecCUDAPlusEquals()); 3671 #else 3672 PetscInt n = matstruct->cprowIndices->size(); 3673 ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3674 #endif 3675 PetscCall(PetscLogGpuTimeEnd()); 3676 } 3677 } else { 3678 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3679 } 3680 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3681 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3682 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3683 } catch (char *ex) { 3684 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
  assembly performance the user should preallocate the matrix storage by setting
  the parameter `nz` (or the array `nnz`).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* the same (m, n) is passed for both size pairs of MatSetSizes() */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* calls the SeqAIJ preallocation routine directly; the cast only drops the const qualifier of nnz */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y, staying on the GPU when possible.

  Input Parameters:
+ Y   - matrix that is updated in place
. a   - scalar multiplying X
. X   - matrix that is added
- str - caller's statement about how the nonzero patterns of X and Y relate

  Dispatch, as implemented below:
  - X and Y install different axpy implementations: use MatAXPY_SeqAIJ().
  - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam writing the result into Y's own CSR arrays.
  - SAME_NONZERO_PATTERN (given, or detected by comparing the CSR index arrays on the device):
    a single cuBLAS axpy over the value arrays.
  - anything else: use MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  /* differing axpy function pointers mean the two matrices do not share this GPU implementation */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nonzero counts: the patterns are identical iff row offsets and column indices both match */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient applied to Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* a and b are passed by host address, so switch the handle to host pointer mode for the geam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    /* scratch buffer is allocated and released on every call */
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    /* output arrays are Y's own (ay, row offsets, column indices): the sum is written in place */
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* switch the handle back to device pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    /* identical patterns: ay[k] += a * ax[k] over the nz stored values */
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* remaining pattern relations are delegated to the SeqAIJ implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the matrix between its CUSPARSE (GPU) and plain SeqAIJ (CPU)
  method tables.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE installs the SeqAIJ operations, PETSC_FALSE installs the CUSPARSE operations

  Factored matrices only record the flag; their method tables are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;
  auto        ops = A->ops;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (!flg) {
    /* GPU: install the CUSPARSE kernels and the device-aware array accessors */
    ops->scale                     = MatScale_SeqAIJCUSPARSE;
    ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    ops->mult                      = MatMult_SeqAIJCUSPARSE;
    ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* CPU: bring the values back to the host before handing the matrix to the SeqAIJ kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    ops->scale                     = MatScale_SeqAIJ;
    ops->axpy                      = MatAXPY_SeqAIJ;
    ops->zeroentries               = MatZeroEntries_SeqAIJ;
    ops->mult                      = MatMult_SeqAIJ;
    ops->multadd                   = MatMultAdd_SeqAIJ;
    ops->multtranspose             = MatMultTranspose_SeqAIJ;
    ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    ops->multhermitiantranspose    = NULL;
    ops->multhermitiantransposeadd = NULL;
    ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    /* clear every SeqAIJ-level accessor override, then remove the composed CUSPARSE entry points */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  }

  A->boundtocpu = flg;
  /* inode kernels are enabled only when bound to the CPU and inode data is present */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.

  Builds B as a plain SeqAIJ matrix and then converts it in place (MAT_INPLACE_MATRIX)
  with MatConvert_SeqAIJ_SeqAIJCUSPARSE().
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  /* in-place conversion: B is passed both as the source and, via &B, as the result slot */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - Discards all COO preallocation data attached to the matrix.

  Input Parameter:
. mat - the matrix; a matrix without a CUSPARSE struct is a no-op

  Deletes the basic-COO permutation arrays and, when the extended COO path was in use, frees the
  device-side jmap/perm arrays. Every released pointer is cleared so that a later call to this
  routine or to MatSeqAIJCUSPARSE_Destroy() (which frees jmap_d/perm_d whenever they are non-NULL)
  does not free the same memory twice.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL; /* avoid a second cudaFree() in MatSeqAIJCUSPARSE_Destroy() */
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  CsrMatrix_Destroy - Releases a CsrMatrix together with its three component arrays.

  Input/Output Parameter:
. mat - address of the CsrMatrix pointer; cleared on return. A NULL matrix is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  /* the arrays are owned by the struct, so release them before the struct itself */
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases everything held by a triangular-factor container
  except the container itself and its cuSPARSE handle, so it can be refilled by a new factorization.

  Input Parameter:
. trifactors - address of the container pointer; a NULL container is a no-op

  The routine is safe to call repeatedly (MatSeqAIJCUSPARSETriFactors_Destroy() calls it again):
  every device buffer and cuSPARSE descriptor is cleared after it is released, and descriptors
  that were never created are skipped.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device buffers: cudaFree(NULL) is a no-op, so only the reset to NULL is needed for re-entrancy */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    /* factBuffer_M is not freed: it shares storage with one of spsvBuffer_L/U, so it is only cleared */
    fs->factBuffer_M = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors: destroy only what exists and clear it so a second reset skips it */
    if (fs->matDescr_M) {
      PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
      fs->matDescr_M = NULL;
    }
    if (fs->spMatDescr_L) {
      PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
      fs->spMatDescr_L = NULL;
    }
    if (fs->spMatDescr_U) {
      PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
      fs->spMatDescr_U = NULL;
    }
    if (fs->spsvDescr_L) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
      fs->spsvDescr_L = NULL;
    }
    if (fs->spsvDescr_Lt) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
      fs->spsvDescr_Lt = NULL;
    }
    if (fs->spsvDescr_U) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
      fs->spsvDescr_U = NULL;
    }
    if (fs->spsvDescr_Ut) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
      fs->spsvDescr_Ut = NULL;
    }
    if (fs->dnVecDescr_X) {
      PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
      fs->dnVecDescr_X = NULL;
    }
    if (fs->dnVecDescr_Y) {
      PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
      fs->dnVecDescr_Y = NULL;
    }
    if (fs->ilu0Info_M) {
      PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
      fs->ilu0Info_M = NULL;
    }
    if (fs->ic0Info_M) {
      PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
      fs->ic0Info_M = NULL;
    }

    /* host mirrors: PetscFree() clears the pointer itself */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - Sets or adds COO values into the device CSR value array.
  Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

  Input Parameters:
+ A     - the matrix; must carry a CUSPARSE struct with an allocated CsrMatrix
. v     - values in the caller's COO order (host or device memory, detected with isCudaMem()), or NULL
- imode - INSERT_VALUES or ADD_VALUES

  Notes:
  Without a COO permutation (cusp->cooPerm) the routine only runs a final assembly.
  With v == NULL and INSERT_VALUES the device values are zeroed; with v == NULL and ADD_VALUES nothing changes.
  On exit the matrix is marked assembled with offloadmask PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging copy of v, used only when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;              /* device view of the input values */
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage it on the device; the staging array is released at finalize */
    /* NOTE(review): an early PetscCall() error return between here and finalize would skip the delete -- confirm this is acceptable */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /*
        thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      /* values += per-entry sums accumulated in cooPerm_w */
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* the reduction writes straight into the CSR value array, overwriting the old values */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* NULL unless v was staged from host memory */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
cusp->cooPerm->size() : 0; 4390 if (n != cooPerm_n) { 4391 delete cusp->cooPerm; 4392 delete cusp->cooPerm_a; 4393 cusp->cooPerm = NULL; 4394 cusp->cooPerm_a = NULL; 4395 } 4396 if (n) { 4397 thrust::device_ptr<PetscInt> d_i, d_j; 4398 PetscInt *d_raw_i, *d_raw_j; 4399 PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE; 4400 PetscMemType imtype, jmtype; 4401 4402 PetscCall(PetscGetMemType(coo_i, &imtype)); 4403 if (PetscMemTypeHost(imtype)) { 4404 PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n)); 4405 PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4406 d_i = thrust::device_pointer_cast(d_raw_i); 4407 free_raw_i = PETSC_TRUE; 4408 PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4409 } else { 4410 d_i = thrust::device_pointer_cast(coo_i); 4411 } 4412 4413 PetscCall(PetscGetMemType(coo_j, &jmtype)); 4414 if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4415 PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n)); 4416 PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4417 d_j = thrust::device_pointer_cast(d_raw_j); 4418 free_raw_j = PETSC_TRUE; 4419 PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4420 } else { 4421 d_j = thrust::device_pointer_cast(coo_j); 4422 } 4423 4424 THRUSTINTARRAY ii(A->rmap->n); 4425 4426 if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n); 4427 if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n); 4428 4429 /* Ex. 
4430 n = 6 4431 coo_i = [3,3,1,4,1,4] 4432 coo_j = [3,2,2,5,2,6] 4433 */ 4434 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j)); 4435 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n)); 4436 4437 PetscCall(PetscLogGpuTimeBegin()); 4438 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4439 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4440 (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */ 4441 THRUSTINTARRAY w(d_j, d_j + n); 4442 4443 /* 4444 d_i = [1,1,3,3,4,4] 4445 d_j = [2,2,2,3,5,6] 4446 cooPerm = [2,4,1,0,3,5] 4447 */ 4448 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4449 4450 /* 4451 d_i = [1,3,3,4,4,x] 4452 ^ekey 4453 d_j = [2,2,3,5,6,x] 4454 ^nekye 4455 */ 4456 if (nekey == ekey) { /* all entries are unique */ 4457 delete cusp->cooPerm_a; 4458 cusp->cooPerm_a = NULL; 4459 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4460 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4461 adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4462 adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4463 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4464 w[0] = 0; 4465 thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4466 thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4467 } 4468 thrust::counting_iterator<PetscInt> search_begin(0); 4469 thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of 
[0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4470 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4471 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4472 PetscCall(PetscLogGpuTimeEnd()); 4473 4474 PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 4475 a->singlemalloc = PETSC_FALSE; 4476 a->free_a = PETSC_TRUE; 4477 a->free_ij = PETSC_TRUE; 4478 PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i)); 4479 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4480 PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4481 a->nz = a->maxnz = a->i[A->rmap->n]; 4482 a->rmax = 0; 4483 PetscCall(PetscMalloc1(a->nz, &a->a)); 4484 PetscCall(PetscMalloc1(a->nz, &a->j)); 4485 PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4486 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen)); 4487 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax)); 4488 for (PetscInt i = 0; i < A->rmap->n; i++) { 4489 const PetscInt nnzr = a->i[i + 1] - a->i[i]; 4490 nzr += (PetscInt) !!(nnzr); 4491 a->ilen[i] = a->imax[i] = nnzr; 4492 a->rmax = PetscMax(a->rmax, nnzr); 4493 } 4494 a->nonzerorowcnt = nzr; 4495 A->preallocated = PETSC_TRUE; 4496 PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt))); 4497 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4498 if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 4499 if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 4500 } else { 4501 PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL)); 4502 } 4503 PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 4504 4505 /* We want to allocate the CUSPARSE struct for matvec now. 
4506 The code is so convoluted now that I prefer to copy zeros */ 4507 PetscCall(PetscArrayzero(a->a, a->nz)); 4508 PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6)); 4509 A->offloadmask = PETSC_OFFLOAD_CPU; 4510 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4511 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 4512 PetscFunctionReturn(PETSC_SUCCESS); 4513 } 4514 4515 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4516 { 4517 Mat_SeqAIJ *seq; 4518 Mat_SeqAIJCUSPARSE *dev; 4519 PetscBool coo_basic = PETSC_TRUE; 4520 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4521 4522 PetscFunctionBegin; 4523 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 4524 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4525 if (coo_i) { 4526 PetscCall(PetscGetMemType(coo_i, &mtype)); 4527 if (PetscMemTypeHost(mtype)) { 4528 for (PetscCount k = 0; k < coo_n; k++) { 4529 if (coo_i[k] < 0 || coo_j[k] < 0) { 4530 coo_basic = PETSC_FALSE; 4531 break; 4532 } 4533 } 4534 } 4535 } 4536 4537 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4538 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 4539 } else { 4540 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 4541 mat->offloadmask = PETSC_OFFLOAD_CPU; 4542 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4543 seq = static_cast<Mat_SeqAIJ *>(mat->data); 4544 dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4545 PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount))); 4546 PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4547 PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount))); 4548 PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4549 dev->use_extended_coo = PETSC_TRUE; 4550 } 4551 
PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAddCOOValues - For every nonzero i in [0, nnz) sums the COO values kv[perm[k]], k in [jmap[i], jmap[i+1]),
  and stores the sum in a[i] (INSERT_VALUES) or adds it to a[i] (any other mode).

  The kernel is written as a grid-stride loop over a 1D launch, so any 1D grid/block size is valid.
  Index arithmetic is carried out in PetscCount so that blockIdx.x * blockDim.x and gridDim.x * blockDim.x
  cannot wrap around in 32-bit unsigned arithmetic.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets the matrix values from the COO values v[].

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES or ADD_VALUES

  Notes:
  When the extended COO maps are in use (dev->use_extended_coo) the values are combined on the device by
  MatAddCOOValues(); host values are first copied to a temporary device buffer of seq->coo_n entries.
  Otherwise the work is delegated to MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites every entry, so the current device values need not be up to date */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      /* make the 64-bit -> launch-dimension conversion explicit; the kernel strides over whatever the grid does not cover */
      const unsigned int nblocks = (unsigned int)((Annz + 255) / 256);

      MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, pass `NULL` if not needed
- j - the CSR column indices, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately without touching the matrix.

  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* nothing to do only when neither output is requested; each output is guarded individually below */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build (once) a device copy of the uncompressed host row offsets a->i */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the
device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

  Note:
  The non-`NULL` ones among `i` and `j` are set to `NULL` on return; `compressed` is currently unused.

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEGetCSRValues_Private - Common back end of MatSeqAIJCUSPARSEGetArrayRead(),
  MatSeqAIJCUSPARSEGetArray() and MatSeqAIJCUSPARSEGetArrayWrite().

  Checks that A carries a CUSPARSE struct in CSR format, brings the device copy up to date when sync is true,
  and returns in vals the raw device pointer to the CSR values. The caller has already validated A.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGetCSRValues_Private(Mat A, PetscBool sync, PetscScalar **vals)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (sync) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *vals = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  PetscScalar *vals;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJCUSPARSEGetCSRValues_Private(A, PETSC_TRUE, &vals));
  *a = vals;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJCUSPARSEGetCSRValues_Private(A, PETSC_TRUE, a));
  /* the caller may modify the values: the device copy becomes the valid one and the cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* write-only access: the existing device structure is required, but no host-to-device copy is done */
  PetscCall(MatSeqAIJCUSPARSEGetCSRValues_Private(A, PETSC_FALSE, a));
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Orders (row, col, value, flag) tuples by row first, then by column; value and flag are ignored */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Adds a fixed offset to an index */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4874 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4875 { 4876 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4877 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4878 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4879 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4880 PetscInt Annz, Bnnz; 4881 cusparseStatus_t stat; 4882 PetscInt i, m, n, zero = 0; 4883 4884 PetscFunctionBegin; 4885 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4886 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4887 PetscValidPointer(C, 4); 4888 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4889 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4890 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4891 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4892 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4893 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4894 if (reuse == MAT_INITIAL_MATRIX) { 4895 m = A->rmap->n; 4896 n = A->cmap->n + B->cmap->n; 4897 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4898 PetscCall(MatSetSizes(*C, m, n, m, n)); 4899 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4900 c = (Mat_SeqAIJ *)(*C)->data; 4901 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4902 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4903 Ccsr = new CsrMatrix; 4904 Cmat->cprowIndices = NULL; 4905 c->compressedrow.use = PETSC_FALSE; 4906 c->compressedrow.nrows = 0; 4907 c->compressedrow.i = NULL; 4908 c->compressedrow.rindex = NULL; 4909 Ccusp->workVector = NULL; 4910 Ccusp->nrows = m; 4911 Ccusp->mat = Cmat; 4912 Ccusp->mat->mat = Ccsr; 4913 Ccsr->num_rows = 
m; 4914 Ccsr->num_cols = n; 4915 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4916 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4917 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4918 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 4919 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 4920 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4921 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4922 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4923 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4924 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4925 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4926 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4927 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4928 4929 Acsr = (CsrMatrix *)Acusp->mat->mat; 4930 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4931 Annz = (PetscInt)Acsr->column_indices->size(); 4932 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4933 c->nz = Annz + Bnnz; 4934 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4935 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4936 Ccsr->values = new THRUSTARRAY(c->nz); 4937 Ccsr->num_entries = c->nz; 4938 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4939 if (c->nz) { 4940 auto Acoo = new THRUSTINTARRAY32(Annz); 4941 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4942 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4943 THRUSTINTARRAY32 *Aroff, *Broff; 4944 4945 if (a->compressedrow.use) { /* need full row offset */ 4946 if (!Acusp->rowoffsets_gpu) { 4947 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4948 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4949 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4950 } 4951 Aroff = Acusp->rowoffsets_gpu; 4952 } else Aroff = Acsr->row_offsets; 4953 if (b->compressedrow.use) { /* need full row offset */ 4954 if (!Bcusp->rowoffsets_gpu) { 4955 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4956 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4957 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4958 } 4959 Broff = Bcusp->rowoffsets_gpu; 4960 } else Broff = Bcsr->row_offsets; 4961 PetscCall(PetscLogGpuTimeBegin()); 4962 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4963 PetscCallCUSPARSE(stat); 4964 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4965 PetscCallCUSPARSE(stat); 4966 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4967 auto Aperm = thrust::make_constant_iterator(1); 4968 auto Bperm = thrust::make_constant_iterator(0); 4969 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4970 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4971 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4972 #else 4973 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4974 auto Bcib = Bcsr->column_indices->begin(); 4975 auto Bcie = Bcsr->column_indices->end(); 4976 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4977 #endif 4978 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4979 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4980 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4981 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4982 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4983 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4984 auto p1 = Ccusp->cooPerm->begin(); 4985 auto p2 = Ccusp->cooPerm->begin(); 4986 thrust::advance(p2, Annz); 4987 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4988 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4989 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4990 #endif 4991 auto cci = thrust::make_counting_iterator(zero); 4992 auto cce = thrust::make_counting_iterator(c->nz); 4993 #if 0 //Errors on SUMMIT cuda 11.1.0 4994 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4995 #else 4996 auto pred = thrust::identity<int>(); 4997 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4998 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4999 #endif 5000 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 5001 PetscCallCUSPARSE(stat); 5002 PetscCall(PetscLogGpuTimeEnd()); 5003 delete wPerm; 5004 delete Acoo; 5005 delete Bcoo; 5006 delete Ccoo; 5007 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 5008 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 5009 PetscCallCUSPARSE(stat); 5010 #endif 5011 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 5012 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 5013 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5014 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5015 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5016 CsrMatrix *CcsrT = new CsrMatrix; 5017 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 5018 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 5019 5020 (*C)->form_explicit_transpose = PETSC_TRUE; 5021 (*C)->transupdated = PETSC_TRUE; 5022 Ccusp->rowoffsets_gpu = NULL; 5023 CmatT->cprowIndices = NULL; 5024 CmatT->mat = CcsrT; 5025 CcsrT->num_rows = n; 5026 CcsrT->num_cols = m; 5027 CcsrT->num_entries = c->nz; 5028 5029 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 5030 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5031 CcsrT->values = new THRUSTARRAY(c->nz); 5032 5033 PetscCall(PetscLogGpuTimeBegin()); 5034 auto rT = CcsrT->row_offsets->begin(); 5035 if (AT) { 5036 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 5037 thrust::advance(rT, -1); 5038 } 5039 if (BT) { 5040 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 5041 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 5042 thrust::copy(titb, tite, rT); 5043 } 5044 auto cT = CcsrT->column_indices->begin(); 5045 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 5046 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 5047 auto vT = CcsrT->values->begin(); 5048 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 5049 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 5050 PetscCall(PetscLogGpuTimeEnd()); 5051 5052 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 5053 
PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 5054 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 5055 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 5056 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 5057 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 5058 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5059 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5060 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5061 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 5062 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 5063 PetscCallCUSPARSE(stat); 5064 #endif 5065 Ccusp->matTranspose = CmatT; 5066 } 5067 } 5068 5069 c->singlemalloc = PETSC_FALSE; 5070 c->free_a = PETSC_TRUE; 5071 c->free_ij = PETSC_TRUE; 5072 PetscCall(PetscMalloc1(m + 1, &c->i)); 5073 PetscCall(PetscMalloc1(c->nz, &c->j)); 5074 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5075 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5076 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5077 ii = *Ccsr->row_offsets; 5078 jj = *Ccsr->column_indices; 5079 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5080 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5081 } else { 5082 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), 
Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5083 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5084 } 5085 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 5086 PetscCall(PetscMalloc1(m, &c->ilen)); 5087 PetscCall(PetscMalloc1(m, &c->imax)); 5088 c->maxnz = c->nz; 5089 c->nonzerorowcnt = 0; 5090 c->rmax = 0; 5091 for (i = 0; i < m; i++) { 5092 const PetscInt nn = c->i[i + 1] - c->i[i]; 5093 c->ilen[i] = c->imax[i] = nn; 5094 c->nonzerorowcnt += (PetscInt) !!nn; 5095 c->rmax = PetscMax(c->rmax, nn); 5096 } 5097 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 5098 PetscCall(PetscMalloc1(c->nz, &c->a)); 5099 (*C)->nonzerostate++; 5100 PetscCall(PetscLayoutSetUp((*C)->rmap)); 5101 PetscCall(PetscLayoutSetUp((*C)->cmap)); 5102 Ccusp->nonzerostate = (*C)->nonzerostate; 5103 (*C)->preallocated = PETSC_TRUE; 5104 } else { 5105 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 5106 c = (Mat_SeqAIJ *)(*C)->data; 5107 if (c->nz) { 5108 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 5109 PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 5110 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 5111 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 5112 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 5113 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 5114 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 5115 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 5116 Acsr = (CsrMatrix *)Acusp->mat->mat; 5117 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 5118 Ccsr 
= (CsrMatrix *)Ccusp->mat->mat; 5119 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 5120 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 5121 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 5122 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 5123 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 5124 auto pmid = Ccusp->cooPerm->begin(); 5125 thrust::advance(pmid, Acsr->num_entries); 5126 PetscCall(PetscLogGpuTimeBegin()); 5127 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 5128 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 5129 thrust::for_each(zibait, zieait, VecCUDAEquals()); 5130 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 5131 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 5132 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 5133 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the value array of A, as exposed by
  MatSeqAIJCUSPARSEGetArrayRead(), into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions within the value array to gather, or NULL to copy the first n entries in order;
        it is uploaded with a host-to-device assign, so it is treated as a host array

  Output Parameter:
. v - receives the n values; isCudaMem(v) selects whether it is written as device or host memory

  Notes:
  With idx given, the gather always runs on the device. If v is host memory the gathered values are
  first written to a temporary device buffer and then copied back to v.

  When n is zero (or idx is NULL) the plain cudaMemcpy() branch is taken.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* Upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Device staging buffer, only populated when v is host memory. It is held by value (not new/delete)
       so that it is released on every exit path, including the early return taken by PetscCallCUDA() on error */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] for k in [0, n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* In both branches a host v is filled by a device-to-host copy, so account for it as GPU -> CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}