/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
  #include <cuda/std/functional>
#endif

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
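
/*
  Illustrative note (added for exposition, not original source text): PetscOptionsEnum()
  returns the 0-based index of the string the user picked, so the entries of the arrays
  above must match the integer values of the corresponding cuSPARSE enums. For example,
  a user value of "CSRMV_ALG1" parses to the PetscEnum value 2, which must equal
  CUSPARSE_CSRMV_ALG1 (or CUSPARSE_SPMV_CSR_ALG1 on CUDA >= 11.4); the PetscCheck() calls
  in MatSetFromOptions_SeqAIJCUSPARSE() below verify exactly this invariant.
*/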

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
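
/*
  Usage sketch (illustrative, not part of the original source): select the ELL format
  for matrix-vector products on an existing MATSEQAIJCUSPARSE matrix A

    PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

  or, equivalently, from the command line with -mat_cusparse_mult_storage_format ell
  (see MatSetFromOptions_SeqAIJCUSPARSE() below).
*/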

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data
  on the CPU and computing the solve there. Use this method to specify whether the solve is done on the
  CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
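
/*
  Illustrative run-time configuration (a sketch; option names are taken from the
  handlers above, and PETSc enum option values are matched case-insensitively):

    ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell \
          -mat_cusparse_spmv_alg csrmv_alg2 -mat_cusparse_use_cpu_solve

  Each option is parsed in MatSetFromOptions_SeqAIJCUSPARSE() and forwarded to the
  corresponding setter, e.g. MatCUSPARSESetFormat() or MatCUSPARSESetUseCPUSolve().
*/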

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
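
      // Layout note (added for exposition, inferred from the loop above): each packed row of M is
      //   [ llen entries of L | one diagonal slot | ulen-1 entries of U ],
      // so a single device CSR (csrRowPtr/csrColIdx/csrVal) backs both triangular
      // solves; the L and U descriptors created below differ only in their
      // fill-mode and diag-type attributes.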
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry, which the CPU factorization stores inverted
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
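
    // Note (summary of the two paths below, added for exposition): with CUDA >= 12.1.1 an
    // existing SpSV analysis can be refreshed in place with cusparseSpSV_updateMatrix()
    // when only the numerical values changed; otherwise the numeric
    // cusparseSpSV_analysis() must be repeated against the new values.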
#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
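      /* Layout note (added for exposition, inferred from the index arithmetic below): for a
         factored SeqAIJ matrix, ai[]/aj[] store the strictly lower triangular part of each
         row, while adiag[] indexes the upper triangular part stored row by row in reverse;
         the unit diagonal of L is therefore added explicitly here. */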
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif
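
        /* Note (added for exposition): the legacy csrsv2 API used in this pre-11.4 branch is a
           three-phase workflow -- query the work-buffer size, run the one-time
           structural/numeric solve analysis, then perform repeated triangular
           solves against that analysis data. */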

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
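
/* Note on the companion routine below (added for exposition): the upper factor is stored
   with its diagonal first in each row and inverted (AAUp[diag] = 1/v[nz]), so the
   triangular solve can multiply instead of divide; the fill mode is UPPER with a
   NON_UNIT diagonal, in contrast to the unit-diagonal L built above. */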

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
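
/* Note (added for exposition): rpermIndices/cpermIndices are device-side copies of the
   row/column permutations; at solve time they are applied as gathers through
   thrust::make_permutation_iterator() (see, e.g., MatSolve_SeqAIJCUSPARSE_Cholesky
   below), so no permuted copy of the matrix itself is needed. */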

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]] = i;                                                  // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
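
// Math note (a sketch added for exposition, matching the steps in the solver below):
// the ICC/Cholesky factors are applied as Ut D U, with the diagonal D kept already
// inverted by the CPU factorization; hence MatSolve computes
//   Ut y = P b    (transpose SpSV on the row-permuted right-hand side),
//   z = D^{-1} y  (elementwise multiply by the stored inverted diagonal),
//   U w = z       (SpSV), followed by the inverse permutation of w into x.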

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if we need to permute, use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back into x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
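/* Note on the pre-11.4 ICC path below (added for exposition): both triangular solves
   reuse the same upper-triangular CSR pattern; the "lower" factor is realized as a
   transpose solve (solveOp = CUSPARSE_OPERATION_TRANSPOSE) whose values are additionally
   scaled by the inverted diagonal, as computed in the fill loops that follow. */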
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
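
/*
  Usage sketch (illustrative; this uses the standard PETSc factorization API and is
  not code from this file): obtaining a cuSPARSE Cholesky factorization explicitly

    Mat           F;
    IS            perm, iperm;
    MatFactorInfo info;

    PetscCall(MatGetOrdering(A, MATORDERINGND, &perm, &iperm));
    PetscCall(MatFactorInfoInitialize(&info));
    PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_CHOLESKY, &F));
    PetscCall(MatCholeskyFactorSymbolic(F, A, perm, &info));
    PetscCall(MatCholeskyFactorNumeric(F, A, &info)); // dispatches to the routine below

  More commonly this path is reached through KSP/PC, e.g. -pc_type icc with a
  MATSEQAIJCUSPARSE operator.
*/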

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
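
/* Note (added for exposition): for CUDA < 11.4 there is no generic SpSV, so transposed
   triangular solves are served by explicitly materializing the CSC form (i.e., the
   transpose) of each factor, which is what the routine below sets up. */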

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO: the lower- and upper-factor transpose setup above is duplicated almost verbatim; it should be refactored into a helper function */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
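      /* a->i holds the full (uncompressed) row offsets on the host; the assign below copies them
         into the device vector, since the csr2csc conversion needs device-resident row offsets
         for the whole matrix. */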
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         It is unclear what a proper value for matstructT->matDescr would be with empty matrices, so we
         set it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine. I checked
           every parameter and they were all correct; it is unclear why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase, so we take that shortcut in the else branch below.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer passed earlier to cusparseSpSV_analysis()!
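  // The SpSV descriptors (fs->spsvDescr_L/_U) retain the analysis data computed at factorization time,
  // so each MatSolve() only pays for the two triangular solves below; no re-analysis happens here.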
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // MatSolveTranspose() is called for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L; we only do a transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if we need to permute, use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
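/* (Presumably because with the legacy csrsv2 API a transposed solve traverses the CSR structure
   column-wise and is typically much slower; materializing the transposed factors once lets every
   subsequent solve run in the natural direction, amortizing the csr2csc cost over many solves.) */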
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
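/* The solve kernels below are reached through the generic PETSc interface rather than called
   directly. A minimal user-side sketch (hypothetical variable names; error checking omitted):

     Mat           A, F;   // A assembled as MATSEQAIJCUSPARSE
     IS            row, col;
     MatFactorInfo info;
     Vec           b, x;

     MatFactorInfoInitialize(&info);
     MatGetOrdering(A, MATORDERINGND, &row, &col);
     MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F);
     MatLUFactorSymbolic(F, A, row, col, &info);
     MatLUFactorNumeric(F, A, &info);
     MatSolve(F, b, x);    // dispatches to one of the MatSolve_SeqAIJCUSPARSE* kernels
*/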
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
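  /* tempGPU (the factors' shared work vector) now holds y = L^{-1} b; the U solve below
     writes x = U^{-1} y directly into xarray. */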
  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's values to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact in place */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }
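  /* CUDA 12.1.1 and later expose cusparseSpSV_updateMatrix(), which refreshes the factor values
     held by existing SpSV descriptors; on older versions the SpSV analysis (which is numeric)
     must be redone whenever the factor values change. */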
#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, so we do it after cusparseXcsrilu02().
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, so reset the flag to redo cusparseSpSV_analysis() for transpose solves */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64; the info is encoded in cusparseSpMatDescr_t
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on the host;
     they will not be used, and are allocated only to ease debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do an in-place factorization on fact.                            */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai; the returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
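  /* The generic SpSV workflow, per the cuSPARSE documentation, is
       cusparseSpSV_createDescr() -> cusparseSpSV_bufferSize() -> cusparseSpSV_analysis() -> cusparseSpSV_solve(),
     and each buffer must stay allocated for as long as its descriptor is in use. The analysis step
     is deferred to the numeric factorization since it needs valid matrix values. */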
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
     To save memory, we let factBuffer_M share storage with the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSV on L and U                             */
  /* The lower (upper) triangular part of M has the same sparsity pattern as L (U) */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call; it calls cudaDeviceSynchronize() to make sure all previous kernels are done */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left of the diagonal in row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We eliminate the nonzeros left of the diagonal one by one; assume each elimination updates
           the nonzeros right of, and including, the eliminated one, at one multiplication and one
           addition per updated entry.
        */
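        /* Summing over the eliminations, the per-row cost is
             sum_{j=1}^{nzLeft} 2 (nzRow - j + 1) = nzLeft (2 nzRow - nzLeft + 1),
           which is the expression charged below. */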
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's values to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact in place */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform the factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter whether A is Hermitian:
     from the point of view of csric02(), A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
       ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on the host;
     they will not be used, and are allocated only to ease debugging.
  */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do an in-place factorization on fact.                            */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
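  /* Unlike the ILU0 setup above, where L is unit-diagonal, the IC(0) factor L carries a genuine
     diagonal that must be read during the solves, hence CUSPARSE_DIAG_TYPE_NON_UNIT below. */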
2068 */ 2069 fillMode = CUSPARSE_FILL_MODE_LOWER; 2070 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2071 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2072 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2073 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2074 2075 /* ========================================================================= */ 2076 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2077 /* ========================================================================= */ 2078 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2079 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2080 2081 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2082 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2083 2084 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2085 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2086 2087 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2088 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2089 2090 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2091 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2092 2093 /* To save device memory, we share the factorization buffer with the larger of the two triangular-solve buffers. 2094 See also the comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2095 */ 2096 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2097 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2098 fs->spsvBuffer_L = fs->factBuffer_M; 2099 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2100 } else { 2101 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2102 fs->spsvBuffer_Lt = fs->factBuffer_M; 2103 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2104 } 2105 2106 /* ========================================================================== */ 2107 /* Perform analysis of ic0 on M */ 2108 /* The lower triangular part of M has the same sparsity pattern as L */ 2109 /* ========================================================================== */ 2110 int structural_zero; 2111 cusparseStatus_t status; 2112 2113 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2114 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2115 if (PetscDefined(USE_DEBUG)) { 2116 /* cusparseXcsric02_zeroPivot() is a blocking call.
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2117 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2118 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2119 } 2120 2121 /* Estimate FLOPs of the numeric factorization */ 2122 { 2123 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2124 PetscInt *Ai, nzRow, nzLeft; 2125 PetscLogDouble flops = 0.0; 2126 2127 Ai = Aseq->i; 2128 for (PetscInt i = 0; i < m; i++) { 2129 nzRow = Ai[i + 1] - Ai[i]; 2130 if (nzRow > 1) { 2131 /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the nonzeros to its right, 2132 including the eliminated one itself, and that each updated entry costs one multiplication and one addition. 2133 */ 2134 nzLeft = (nzRow - 1) / 2; 2135 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2136 } 2137 } 2138 fs->numericFactFlops = flops; 2139 } 2140 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2141 PetscFunctionReturn(PETSC_SUCCESS); 2142 } 2143 #endif 2144 2145 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2146 { 2147 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2148 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2149 2150 PetscFunctionBegin; 2151 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2152 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2153 B->offloadmask = PETSC_OFFLOAD_CPU; 2154 2155 if (!cusparsestruct->use_cpu_solve) { 2156 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2157 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2158 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2159 #else 2160 /* determine which version of MatSolve needs to be used.
*/ 2161 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2162 IS isrow = b->row, iscol = b->col; 2163 PetscBool row_identity, col_identity; 2164 2165 PetscCall(ISIdentity(isrow, &row_identity)); 2166 PetscCall(ISIdentity(iscol, &col_identity)); 2167 if (row_identity && col_identity) { 2168 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2169 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2170 } else { 2171 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2172 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2173 } 2174 #endif 2175 } 2176 B->ops->matsolve = NULL; 2177 B->ops->matsolvetranspose = NULL; 2178 2179 /* get the triangular factors */ 2180 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2181 PetscFunctionReturn(PETSC_SUCCESS); 2182 } 2183 2184 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2185 { 2186 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2187 2188 PetscFunctionBegin; 2189 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2190 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2191 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2192 PetscFunctionReturn(PETSC_SUCCESS); 2193 } 2194 2195 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2196 { 2197 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2198 2199 PetscFunctionBegin; 2200 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2201 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2202 if (!info->factoronhost) { 2203 PetscCall(ISIdentity(isrow, &row_identity)); 2204 PetscCall(ISIdentity(iscol, &col_identity)); 2205 } 2206 if (!info->levels && row_identity && col_identity) { 2207 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2208 } else 2209 #endif 2210 { 2211 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2212 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2213 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2214 } 2215 PetscFunctionReturn(PETSC_SUCCESS); 2216 } 2217 2218 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2219 { 2220 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2221 2222 PetscFunctionBegin; 2223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2224 PetscBool perm_identity = PETSC_FALSE; 2225 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2226 if (!info->levels && perm_identity) { 2227 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2228 } else 2229 #endif 2230 { 2231 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2232 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2233 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2234 } 2235 PetscFunctionReturn(PETSC_SUCCESS); 2236 } 2237 2238 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2239 { 2240 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2241 2242 PetscFunctionBegin; 2243 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2244 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2245 
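/* Editorial note (a hedged reading of the surrounding code, not a statement from the cusparse docs): unlike the ICC(0)/ILU(0)
   fast paths above, the full Cholesky symbolic factorization here runs on the host via MatCholeskyFactorSymbolic_SeqAIJ();
   only the numeric op is redirected below so the resulting factors feed the CUSPARSE (or CPU fallback) triangular solves,
   mirroring the LU flow in MatLUFactorNumeric_SeqAIJCUSPARSE(). */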
B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2246 PetscFunctionReturn(PETSC_SUCCESS); 2247 } 2248 2249 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2250 { 2251 PetscFunctionBegin; 2252 *type = MATSOLVERCUSPARSE; 2253 PetscFunctionReturn(PETSC_SUCCESS); 2254 } 2255 2256 /*MC 2257 MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices 2258 of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported 2259 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer 2260 performance in the triangular solves. Full LU and Cholesky decompositions can also be solved through the 2261 CuSPARSE triangular solve algorithm, but the performance can be quite poor, so these 2262 algorithms are not recommended. This class does NOT support direct solver operations. 2263 2264 Level: beginner 2265 2266 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 2267 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2268 M*/ 2269 2270 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2271 { 2272 PetscInt n = A->rmap->n; 2273 2274 PetscFunctionBegin; 2275 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2276 PetscCall(MatSetSizes(*B, n, n, n, n)); 2277 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2278 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2279 2280 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2281 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2282 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2283 if (!A->boundtocpu) { 2284 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2285 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2286 } else { 2287 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2288 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2289 } 2290 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2291 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2292 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2293 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2294 if (!A->boundtocpu) { 2295 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2296 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2297 } else { 2298 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2299 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2300 } 2301 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2302 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2303 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2304 2305 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2306 (*B)->canuseordering = PETSC_TRUE; 2307 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 2308
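/* A minimal usage sketch of the factor interface registered above; it is illustrative only (not part of this source)
   and assumes hypothetical user objects A (a MATSEQAIJCUSPARSE matrix), index sets isrow/iscol, and vectors b, x:

       Mat           F;
       MatFactorInfo info;

       PetscCall(MatFactorInfoInitialize(&info));
       PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F));
       PetscCall(MatILUFactorSymbolic(F, A, isrow, iscol, &info));
       PetscCall(MatLUFactorNumeric(F, A, &info));
       PetscCall(MatSolve(F, b, x));

   Equivalently, from the options database: -pc_type ilu -pc_factor_mat_solver_type cusparse */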
PetscFunctionReturn(PETSC_SUCCESS); 2309 } 2310 2311 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2312 { 2313 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2314 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2315 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2316 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 2317 #endif 2318 2319 PetscFunctionBegin; 2320 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2321 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2322 if (A->factortype == MAT_FACTOR_NONE) { 2323 CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 2324 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2325 } 2326 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2327 else if (fs->csrVal) { 2328 /* We have a factorized matrix on device and are able to copy it to host */ 2329 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2330 } 2331 #endif 2332 else 2333 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 2334 PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 2335 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2336 A->offloadmask = PETSC_OFFLOAD_BOTH; 2337 } 2338 PetscFunctionReturn(PETSC_SUCCESS); 2339 } 2340 2341 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2342 { 2343 PetscFunctionBegin; 2344 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2345 *array = ((Mat_SeqAIJ *)A->data)->a; 2346 PetscFunctionReturn(PETSC_SUCCESS); 2347 } 2348 2349 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2350 { 2351 PetscFunctionBegin; 2352 A->offloadmask = PETSC_OFFLOAD_CPU; 2353 *array = NULL; 2354 PetscFunctionReturn(PETSC_SUCCESS); 2355 } 2356 2357 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2358 { 2359 PetscFunctionBegin; 2360 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2361 *array = ((Mat_SeqAIJ *)A->data)->a; 2362 PetscFunctionReturn(PETSC_SUCCESS); 2363 } 2364 2365 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2366 { 2367 PetscFunctionBegin; 2368 *array = NULL; 2369 PetscFunctionReturn(PETSC_SUCCESS); 2370 } 2371 2372 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2373 { 2374 PetscFunctionBegin; 2375 *array = ((Mat_SeqAIJ *)A->data)->a; 2376 PetscFunctionReturn(PETSC_SUCCESS); 2377 } 2378 2379 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2380 { 2381 PetscFunctionBegin; 2382 A->offloadmask = PETSC_OFFLOAD_CPU; 2383 *array = NULL; 2384 PetscFunctionReturn(PETSC_SUCCESS); 2385 } 2386 2387 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2388 { 2389 Mat_SeqAIJCUSPARSE *cusp; 2390 CsrMatrix *matrix; 2391 2392 PetscFunctionBegin; 2393 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2394 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2395 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2396 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 2397 matrix = (CsrMatrix *)cusp->mat->mat; 2398 2399 if (i) { 2400 #if !defined(PETSC_USE_64BIT_INDICES) 2401 *i = 
matrix->row_offsets->data().get(); 2402 #else 2403 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices"); 2404 #endif 2405 } 2406 if (j) { 2407 #if !defined(PETSC_USE_64BIT_INDICES) 2408 *j = matrix->column_indices->data().get(); 2409 #else 2410 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices"); 2411 #endif 2412 } 2413 if (a) *a = matrix->values->data().get(); 2414 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 2415 PetscFunctionReturn(PETSC_SUCCESS); 2416 } 2417 2418 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2419 { 2420 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 2421 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2422 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2423 PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2424 cusparseStatus_t stat; 2425 PetscBool both = PETSC_TRUE; 2426 2427 PetscFunctionBegin; 2428 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2429 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2430 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2431 CsrMatrix *matrix; 2432 matrix = (CsrMatrix *)cusparsestruct->mat->mat; 2433 2434 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2435 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2436 matrix->values->assign(a->a, a->a + a->nz); 2437 PetscCallCUDA(WaitForCUDA()); 2438 PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2439 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2440 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2441 } else { 2442 PetscInt nnz; 2443 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2444 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2445 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2446 delete cusparsestruct->workVector; 2447 delete cusparsestruct->rowoffsets_gpu; 2448 cusparsestruct->workVector = NULL; 2449 cusparsestruct->rowoffsets_gpu = NULL; 2450 try { 2451 if (a->compressedrow.use) { 2452 m = a->compressedrow.nrows; 2453 ii = a->compressedrow.i; 2454 ridx = a->compressedrow.rindex; 2455 } else { 2456 m = A->rmap->n; 2457 ii = a->i; 2458 ridx = NULL; 2459 } 2460 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2461 if (!a->a) { 2462 nnz = ii[m]; 2463 both = PETSC_FALSE; 2464 } else nnz = a->nz; 2465 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2466 2467 /* create cusparse matrix */ 2468 cusparsestruct->nrows = m; 2469 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2470 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2471 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2472 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2473 2474 PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2475 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2476 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2477 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2478 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
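/* Note: the scalar constants 1.0 and 0.0 are staged in device memory because the handle is switched to
   CUSPARSE_POINTER_MODE_DEVICE just below; cusparse then dereferences the alpha/beta arguments of, e.g.,
   y = alpha*op(A)*x + beta*y on the device instead of reading host pointers. These device-side scalars
   (matstruct->alpha_one, beta_zero, beta_one) are the ones later passed to cusparseSpMV/cusparseSpMM. */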
2479 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2480 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2481 2482 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2483 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2484 /* set the matrix */ 2485 CsrMatrix *mat = new CsrMatrix; 2486 mat->num_rows = m; 2487 mat->num_cols = A->cmap->n; 2488 mat->num_entries = nnz; 2489 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2490 mat->row_offsets->assign(ii, ii + m + 1); 2491 2492 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2493 mat->column_indices->assign(a->j, a->j + nnz); 2494 2495 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2496 if (a->a) mat->values->assign(a->a, a->a + nnz); 2497 2498 /* assign the pointer */ 2499 matstruct->mat = mat; 2500 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2501 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2502 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2503 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2504 PetscCallCUSPARSE(stat); 2505 } 2506 #endif 2507 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2508 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2509 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2510 #else 2511 CsrMatrix *mat = new CsrMatrix; 2512 mat->num_rows = m; 2513 mat->num_cols = A->cmap->n; 2514 mat->num_entries = nnz; 2515 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2516 mat->row_offsets->assign(ii, ii + m + 1); 2517 2518 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2519 mat->column_indices->assign(a->j, a->j + nnz); 2520 2521 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2522 if (a->a) mat->values->assign(a->a, a->a + nnz); 2523 2524 cusparseHybMat_t hybMat; 2525 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2526 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2527 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2528 PetscCallCUSPARSE(stat); 2529 /* assign the pointer */ 2530 matstruct->mat = hybMat; 2531 2532 if (mat) { 2533 if (mat->values) delete (THRUSTARRAY *)mat->values; 2534 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2535 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2536 delete (CsrMatrix *)mat; 2537 } 2538 #endif 2539 } 2540 2541 /* assign the compressed row indices */ 2542 if (a->compressedrow.use) { 2543 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2544 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2545 matstruct->cprowIndices->assign(ridx, ridx + m); 2546 tmp = m; 2547 } else { 2548 cusparsestruct->workVector = NULL; 2549 matstruct->cprowIndices = NULL; 2550 tmp = 0; 2551 } 2552 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2553 2554 /* assign the pointer */ 2555 cusparsestruct->mat = matstruct; 2556 } catch (char *ex) { 2557 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2558 } 2559 PetscCallCUDA(WaitForCUDA()); 2560 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2561 cusparsestruct->nonzerostate = A->nonzerostate; 2562 } 2563 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2564 } 2565 PetscFunctionReturn(PETSC_SUCCESS); 2566 } 2567 2568 struct VecCUDAPlusEquals { 2569 template <typename Tuple> 2570 __host__ __device__ void operator()(Tuple t) 2571 { 2572 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2573 } 2574 }; 2575 2576 struct VecCUDAEquals { 2577 template <typename Tuple> 2578 __host__ __device__ void operator()(Tuple t) 2579 { 2580 thrust::get<1>(t) = thrust::get<0>(t); 2581 } 2582 }; 2583 2584 struct VecCUDAEqualsReverse { 2585 template <typename Tuple> 2586 __host__ __device__ void operator()(Tuple t) 2587 { 2588 thrust::get<0>(t) = thrust::get<1>(t); 2589 } 2590 }; 2591 2592 struct MatMatCusparse { 2593 PetscBool cisdense; 2594 PetscScalar *Bt; 2595 Mat X; 2596 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2597 PetscLogDouble flops; 2598 CsrMatrix *Bcsr; 2599 2600 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2601 cusparseSpMatDescr_t matSpBDescr; 2602 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2603 cusparseDnMatDescr_t matBDescr; 2604 cusparseDnMatDescr_t matCDescr; 2605 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2606 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2607 void *dBuffer4; 2608 void *dBuffer5; 2609 #endif 2610 size_t mmBufferSize; 2611 void *mmBuffer; 2612 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2613 cusparseSpGEMMDescr_t spgemmDesc; 2614 #endif 2615 }; 2616 2617 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2618 { 2619 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2620 2621 PetscFunctionBegin; 2622 PetscCallCUDA(cudaFree(mmdata->Bt)); 2623 delete mmdata->Bcsr; 2624 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2625 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2626 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2627 if (mmdata->matCDescr) 
PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2628 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2629 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2630 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2631 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2632 #endif 2633 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2634 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2635 #endif 2636 PetscCall(MatDestroy(&mmdata->X)); 2637 PetscCall(PetscFree(data)); 2638 PetscFunctionReturn(PETSC_SUCCESS); 2639 } 2640 2641 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2642 2643 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2644 { 2645 Mat_Product *product = C->product; 2646 Mat A, B; 2647 PetscInt m, n, blda, clda; 2648 PetscBool flg, biscuda; 2649 Mat_SeqAIJCUSPARSE *cusp; 2650 cusparseStatus_t stat; 2651 cusparseOperation_t opA; 2652 const PetscScalar *barray; 2653 PetscScalar *carray; 2654 MatMatCusparse *mmdata; 2655 Mat_SeqAIJCUSPARSEMultStruct *mat; 2656 CsrMatrix *csrmat; 2657 2658 PetscFunctionBegin; 2659 MatCheckProduct(C, 1); 2660 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2661 mmdata = (MatMatCusparse *)product->data; 2662 A = product->A; 2663 B = product->B; 2664 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2665 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2666 /* currently CopyToGpu does not copy if the matrix is bound to the CPU. 2667 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2668 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2669 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2670 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2671 switch (product->type) { 2672 case MATPRODUCT_AB: 2673 case MATPRODUCT_PtAP: 2674 mat = cusp->mat; 2675 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2676 m = A->rmap->n; 2677 n = B->cmap->n; 2678 break; 2679 case MATPRODUCT_AtB: 2680 if (!A->form_explicit_transpose) { 2681 mat = cusp->mat; 2682 opA = CUSPARSE_OPERATION_TRANSPOSE; 2683 } else { 2684 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2685 mat = cusp->matTranspose; 2686 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2687 } 2688 m = A->cmap->n; 2689 n = B->cmap->n; 2690 break; 2691 case MATPRODUCT_ABt: 2692 case MATPRODUCT_RARt: 2693 mat = cusp->mat; 2694 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2695 m = A->rmap->n; 2696 n = B->rmap->n; 2697 break; 2698 default: 2699 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2700 } 2701 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2702 csrmat = (CsrMatrix *)mat->mat; 2703 /* if the user passed a CPU matrix, copy the data to the GPU */ 2704 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2705 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2706 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2707 2708 PetscCall(MatDenseGetLDA(B, &blda)); 2709 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2710
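/* For MATPRODUCT_PtAP and MATPRODUCT_RARt the SpMM below only forms the dense intermediate
   X = A*P (resp. X = A*R^T) in mmdata->X; the final C = P^T*X (resp. C = R*X) is computed
   afterwards by MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(). */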
PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2711 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2712 } else { 2713 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2714 PetscCall(MatDenseGetLDA(C, &clda)); 2715 } 2716 2717 PetscCall(PetscLogGpuTimeBegin()); 2718 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2719 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2720 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2721 cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2722 #else 2723 cusparseSpMatDescr_t &matADescr = mat->matDescr; 2724 #endif 2725 2726 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2727 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2728 size_t mmBufferSize; 2729 if (mmdata->initialized && mmdata->Blda != blda) { 2730 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2731 mmdata->matBDescr = NULL; 2732 } 2733 if (!mmdata->matBDescr) { 2734 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2735 mmdata->Blda = blda; 2736 } 2737 2738 if (mmdata->initialized && mmdata->Clda != clda) { 2739 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2740 mmdata->matCDescr = NULL; 2741 } 2742 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2743 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2744 mmdata->Clda = clda; 2745 } 2746 2747 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2748 if (matADescr) { 2749 PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. 
It could be a cusparse bug 2750 matADescr = NULL; 2751 } 2752 #endif 2753 2754 if (!matADescr) { 2755 stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2756 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2757 PetscCallCUSPARSE(stat); 2758 } 2759 2760 PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2761 2762 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2763 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2764 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2765 mmdata->mmBufferSize = mmBufferSize; 2766 } 2767 2768 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0 2769 PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2770 #endif 2771 2772 mmdata->initialized = PETSC_TRUE; 2773 } else { 2774 /* to be safe, always update pointers of the mats */ 2775 PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 2776 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2777 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2778 } 2779 2780 /* do cusparseSpMM, which supports transpose on B */ 2781 PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2782 #else 2783 PetscInt k; 2784 /* cusparseXcsrmm does not support transpose on B */ 2785 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2786 cublasHandle_t cublasv2handle; 2787 cublasStatus_t cerr; 2788 2789 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2790 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2791 PetscCallCUBLAS(cerr); 2792 blda = B->cmap->n; 2793 k = B->cmap->n; 2794 } else { 2795 k = B->rmap->n; 2796 } 2797 2798 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2799 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2800 PetscCallCUSPARSE(stat); 2801 #endif 2802 PetscCall(PetscLogGpuTimeEnd()); 2803 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2804 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2805 if (product->type == MATPRODUCT_RARt) { 2806 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2807 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2808 } else if (product->type == MATPRODUCT_PtAP) { 2809 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2810 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2811 } else { 2812 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2813 } 2814 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2815 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2816 PetscFunctionReturn(PETSC_SUCCESS); 2817 } 2818 2819 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2820 { 2821 Mat_Product *product = C->product; 2822 Mat A, B; 2823 PetscInt m, n; 2824 PetscBool cisdense, flg; 2825 MatMatCusparse *mmdata; 2826 Mat_SeqAIJCUSPARSE *cusp; 2827 2828 PetscFunctionBegin; 2829 MatCheckProduct(C, 1); 2830 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2831 A = product->A; 2832 B = product->B; 2833 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2834 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2835 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2836 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2837 switch (product->type) { 2838 case MATPRODUCT_AB: 2839 m = A->rmap->n; 2840 n = B->cmap->n; 2841 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 2842 break; 2843 case MATPRODUCT_AtB: 2844 m = A->cmap->n; 2845 n = B->cmap->n; 2846 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 2847 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2848 break; 2849 case MATPRODUCT_ABt: 2850 m = A->rmap->n; 2851 n = B->rmap->n; 2852 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 2853 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2854 break; 2855 case MATPRODUCT_PtAP: 2856 m = B->cmap->n; 2857 n = B->cmap->n; 2858 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs)); 2859 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2860 break; 2861 case MATPRODUCT_RARt: 2862 m = B->rmap->n; 2863 n = B->rmap->n; 2864 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs)); 2865 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2866 break; 2867 default: 2868 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2869 } 2870 PetscCall(MatSetSizes(C, m, n, m, n)); 2871 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2872 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2873 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2874 2875 /* product data */ 2876 PetscCall(PetscNew(&mmdata)); 2877 mmdata->cisdense = 
cisdense; 2878 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2879 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2880 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2881 #endif 2882 /* for these products we need intermediate storage */ 2883 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2884 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2885 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2886 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2887 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2888 } else { 2889 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2890 } 2891 } 2892 C->product->data = mmdata; 2893 C->product->destroy = MatDestroy_MatMatCusparse; 2894 2895 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2896 PetscFunctionReturn(PETSC_SUCCESS); 2897 } 2898 2899 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2900 { 2901 Mat_Product *product = C->product; 2902 Mat A, B; 2903 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2904 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2905 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2906 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2907 PetscBool flg; 2908 cusparseStatus_t stat; 2909 MatProductType ptype; 2910 MatMatCusparse *mmdata; 2911 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2912 cusparseSpMatDescr_t BmatSpDescr; 2913 #endif 2914 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2915 2916 PetscFunctionBegin; 2917 MatCheckProduct(C, 1); 2918 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2919 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 2920 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2921 mmdata = (MatMatCusparse *)C->product->data; 2922 A = product->A; 2923 B = product->B; 2924 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2925 mmdata->reusesym = PETSC_FALSE; 2926 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2927 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2928 Cmat = Ccusp->mat; 2929 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2930 Ccsr = (CsrMatrix *)Cmat->mat; 2931 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2932 goto finalize; 2933 } 2934 if (!c->nz) goto finalize; 2935 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2936 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2937 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2938 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2939 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), 
PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2940 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2941 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2942 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2943 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2944 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2945 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2946 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2947 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2948 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2949 2950 ptype = product->type; 2951 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2952 ptype = MATPRODUCT_AB; 2953 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2954 } 2955 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2956 ptype = MATPRODUCT_AB; 2957 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2958 } 2959 switch (ptype) { 2960 case MATPRODUCT_AB: 2961 Amat = Acusp->mat; 2962 Bmat = Bcusp->mat; 2963 break; 2964 case MATPRODUCT_AtB: 2965 Amat = Acusp->matTranspose; 2966 Bmat = Bcusp->mat; 2967 break; 2968 case MATPRODUCT_ABt: 2969 Amat = Acusp->mat; 2970 Bmat = Bcusp->matTranspose; 2971 break; 2972 default: 2973 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2974 } 2975 Cmat = Ccusp->mat; 2976 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2977 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2978 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2979 Acsr = (CsrMatrix *)Amat->mat; 2980 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2981 Ccsr = (CsrMatrix *)Cmat->mat; 2982 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2983 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2984 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2985 PetscCall(PetscLogGpuTimeBegin()); 2986 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2987 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2988 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2989 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2990 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2991 PetscCallCUSPARSE(stat); 2992 #else 2993 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2994 PetscCallCUSPARSE(stat); 2995 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2996 PetscCallCUSPARSE(stat); 2997 #endif 2998 #else 2999 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3000 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3001 PetscCallCUSPARSE(stat); 3002 #endif 3003 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3004 PetscCallCUDA(WaitForCUDA()); 3005 PetscCall(PetscLogGpuTimeEnd()); 3006 C->offloadmask = PETSC_OFFLOAD_GPU; 3007 finalize: 3008 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3009 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 3010 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 3011 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 3012 c->reallocs = 0; 3013 C->info.mallocs += 0; 3014 C->info.nz_unneeded = 0; 3015 C->assembled = C->was_assembled = PETSC_TRUE; 3016 C->num_ass++; 3017 PetscFunctionReturn(PETSC_SUCCESS); 3018 } 3019 3020 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3021 { 3022 Mat_Product *product = C->product; 3023 Mat A, B; 3024 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 3025 Mat_SeqAIJ *a, *b, *c; 3026 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 3027 CsrMatrix *Acsr, *Bcsr, *Ccsr; 3028 PetscInt i, j, m, n, k; 3029 PetscBool flg; 3030 cusparseStatus_t stat; 3031 MatProductType ptype; 3032 MatMatCusparse *mmdata; 3033 PetscLogDouble flops; 3034 PetscBool biscompressed, ciscompressed; 3035 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3036 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3037 cusparseSpMatDescr_t BmatSpDescr; 3038 #else 3039 int cnz; 3040 #endif 3041 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3042 3043 PetscFunctionBegin; 3044 MatCheckProduct(C, 1); 3045 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3046 A = product->A; 3047 B = product->B; 3048 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3049 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3050 
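/* Outline of the symbolic phase that follows: copy A and B to the GPU; select the mult structs, forming
   explicit transposes for AtB/ABt since cusparse spgemm does not support transposed operands; let C inherit
   A's compressed-row layout; then obtain nnz(C) and fill its CSR arrays via cusparseSpGEMMreuse (CUDA >= 11.4),
   cusparseSpGEMM (CUDA >= 11.0), or cusparseXcsrgemm (older CUDA). */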
PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3051 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3052 a = (Mat_SeqAIJ *)A->data; 3053 b = (Mat_SeqAIJ *)B->data; 3054 /* product data */ 3055 PetscCall(PetscNew(&mmdata)); 3056 C->product->data = mmdata; 3057 C->product->destroy = MatDestroy_MatMatCusparse; 3058 3059 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3060 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3061 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3062 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3063 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3064 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3065 3066 ptype = product->type; 3067 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3068 ptype = MATPRODUCT_AB; 3069 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3070 } 3071 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3072 ptype = MATPRODUCT_AB; 3073 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3074 } 3075 biscompressed = PETSC_FALSE; 3076 ciscompressed = PETSC_FALSE; 3077 switch (ptype) { 3078 case MATPRODUCT_AB: 3079 m = A->rmap->n; 3080 n = B->cmap->n; 3081 k = A->cmap->n; 3082 Amat = Acusp->mat; 3083 Bmat = Bcusp->mat; 3084 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3085 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3086 break; 3087 case MATPRODUCT_AtB: 3088 m = A->cmap->n; 3089 n = B->cmap->n; 3090 k = A->rmap->n; 3091 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3092 Amat = Acusp->matTranspose; 3093 Bmat = Bcusp->mat; 3094 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3095 break; 3096 case MATPRODUCT_ABt: 3097 m = A->rmap->n; 3098 n = B->rmap->n; 3099 k = A->cmap->n; 3100 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3101 Amat = Acusp->mat; 3102 Bmat = Bcusp->matTranspose; 3103 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3104 break; 3105 default: 3106 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3107 } 3108 3109 /* create cusparse matrix */ 3110 PetscCall(MatSetSizes(C, m, n, m, n)); 3111 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3112 c = (Mat_SeqAIJ *)C->data; 3113 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3114 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3115 Ccsr = new CsrMatrix; 3116 3117 c->compressedrow.use = ciscompressed; 3118 if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */ 3119 c->compressedrow.nrows = a->compressedrow.nrows; 3120 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3121 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3122 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3123 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3124 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3125 } else { 3126 c->compressedrow.nrows = 0; 3127 c->compressedrow.i = NULL; 3128 c->compressedrow.rindex = NULL; 3129 Ccusp->workVector = NULL; 3130 Cmat->cprowIndices = NULL; 3131 } 3132 Ccusp->nrows = ciscompressed
? c->compressedrow.nrows : m; 3133 Ccusp->mat = Cmat; 3134 Ccusp->mat->mat = Ccsr; 3135 Ccsr->num_rows = Ccusp->nrows; 3136 Ccsr->num_cols = n; 3137 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3138 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3139 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3140 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3141 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3142 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3143 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3144 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3145 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3146 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3147 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */ 3148 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3149 c->nz = 0; 3150 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3151 Ccsr->values = new THRUSTARRAY(c->nz); 3152 goto finalizesym; 3153 } 3154 3155 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3156 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3157 Acsr = (CsrMatrix *)Amat->mat; 3158 if (!biscompressed) { 3159 Bcsr = (CsrMatrix *)Bmat->mat; 3160 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3161 BmatSpDescr = Bmat->matDescr; 3162 #endif 3163 } else { /* we need to use row offsets for the full matrix */ 3164 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3165 Bcsr = new CsrMatrix; 3166 Bcsr->num_rows = B->rmap->n; 3167 Bcsr->num_cols = cBcsr->num_cols; 3168 Bcsr->num_entries = cBcsr->num_entries; 3169 Bcsr->column_indices = cBcsr->column_indices; 3170 Bcsr->values = cBcsr->values; 3171 if (!Bcusp->rowoffsets_gpu) { 3172 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3173 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3174 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3175 } 3176 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3177 mmdata->Bcsr = Bcsr; 3178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3179 if (Bcsr->num_rows && Bcsr->num_cols) { 3180 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3181 PetscCallCUSPARSE(stat); 3182 } 3183 BmatSpDescr = mmdata->matSpBDescr; 3184 #endif 3185 } 3186 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3187 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3188 /* precompute flops count */ 3189 if (ptype == MATPRODUCT_AB) { 3190 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3191 const PetscInt st = a->i[i]; 3192 const PetscInt en = a->i[i + 1]; 3193 for (j = st; j < en; j++) { 3194 const PetscInt brow = a->j[j]; 3195 flops += 2.
* (b->i[brow + 1] - b->i[brow]); 3196 } 3197 } 3198 } else if (ptype == MATPRODUCT_AtB) { 3199 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3200 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3201 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3202 flops += (2. * anzi) * bnzi; 3203 } 3204 } else { /* TODO */ 3205 flops = 0.; 3206 } 3207 3208 mmdata->flops = flops; 3209 PetscCall(PetscLogGpuTimeBegin()); 3210 3211 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3212 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3213 // cuda-12.2 requires non-null csrRowOffsets 3214 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3215 PetscCallCUSPARSE(stat); 3216 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3217 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3218 { 3219 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3220 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3221 */ 3222 void *dBuffer1 = NULL; 3223 void *dBuffer2 = NULL; 3224 void *dBuffer3 = NULL; 3225 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3226 size_t bufferSize1 = 0; 3227 size_t bufferSize2 = 0; 3228 size_t bufferSize3 = 0; 3229 size_t bufferSize4 = 0; 3230 size_t bufferSize5 = 0; 3231 3232 /* ask bufferSize1 bytes for external memory */ 3233 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3234 PetscCallCUSPARSE(stat); 3235 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3236 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3237 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3238 PetscCallCUSPARSE(stat); 3239 3240 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3241 PetscCallCUSPARSE(stat); 3242 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3243 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3244 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3245 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3246 PetscCallCUSPARSE(stat); 3247 PetscCallCUDA(cudaFree(dBuffer1)); 3248 PetscCallCUDA(cudaFree(dBuffer2)); 3249 3250 /* get matrix C non-zero entries C_nnz1 */ 3251 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3252 c->nz = (PetscInt)C_nnz1; 3253 /* allocate matrix C */ 3254 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3255 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3256 Ccsr->values = new THRUSTARRAY(c->nz); 3257 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3258 /* update matC with the new pointers */ 3259 stat = cusparseCsrSetPointers(Cmat->matDescr, 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3260 PetscCallCUSPARSE(stat); 3261 3262 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3263 PetscCallCUSPARSE(stat); 3264 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3265 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3266 PetscCallCUSPARSE(stat); 3267 PetscCallCUDA(cudaFree(dBuffer3)); 3268 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3269 PetscCallCUSPARSE(stat); 3270 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3271 } 3272 #else 3273 size_t bufSize2; 3274 /* ask bufferSize bytes for external memory */ 3275 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 3276 PetscCallCUSPARSE(stat); 3277 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3278 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3279 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3280 PetscCallCUSPARSE(stat); 3281 /* ask again for bufferSize bytes of external memory */ 3282 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3283 PetscCallCUSPARSE(stat); 3284 /* Neither the CUSPARSE documentation nor the API is clear: 3285 we need both buffers to perform the operations properly! 3286 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 3287 it only appears in the workEstimation step, yet it seems to be needed in compute, so presumably its address 3288 is stored in the descriptor! What a messy API...
*/ 3289 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3290 /* compute the intermediate product of A * B */ 3291 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3292 PetscCallCUSPARSE(stat); 3293 /* get matrix C non-zero entries C_nnz1 */ 3294 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3295 c->nz = (PetscInt)C_nnz1; 3296 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3297 mmdata->mmBufferSize / 1024)); 3298 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3299 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3300 Ccsr->values = new THRUSTARRAY(c->nz); 3301 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3302 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3303 PetscCallCUSPARSE(stat); 3304 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3305 PetscCallCUSPARSE(stat); 3306 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3307 #else 3308 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3309 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3310 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3311 PetscCallCUSPARSE(stat); 3312 c->nz = cnz; 3313 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3314 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3315 Ccsr->values = new THRUSTARRAY(c->nz); 3316 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3317 3318 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3319 /* With the old gemm interface (removed from CUDA 11.0 on) we cannot compute only the symbolic factorization. 3320 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 3321 D is NULL, despite the fact that the CUSPARSE documentation claims this is supported!
*/ 3322 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3323 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3324 PetscCallCUSPARSE(stat); 3325 #endif 3326 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3327 PetscCall(PetscLogGpuTimeEnd()); 3328 finalizesym: 3329 c->free_a = PETSC_TRUE; 3330 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3331 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3332 c->free_ij = PETSC_TRUE; 3333 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3334 PetscInt *d_i = c->i; 3335 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3336 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3337 ii = *Ccsr->row_offsets; 3338 jj = *Ccsr->column_indices; 3339 if (ciscompressed) d_i = c->compressedrow.i; 3340 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3341 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3342 } else { 3343 PetscInt *d_i = c->i; 3344 if (ciscompressed) d_i = c->compressedrow.i; 3345 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3346 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3347 } 3348 if (ciscompressed) { /* need to expand host row offsets */ 3349 PetscInt r = 0; 3350 c->i[0] = 0; 3351 for (k = 0; k < c->compressedrow.nrows; k++) { 3352 const PetscInt next = c->compressedrow.rindex[k]; 3353 const PetscInt old = c->compressedrow.i[k]; 3354 for (; r < next; r++) c->i[r + 1] = old; 3355 } 3356 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3357 } 3358 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3359 PetscCall(PetscMalloc1(m, &c->ilen)); 3360 PetscCall(PetscMalloc1(m, &c->imax)); 3361 c->maxnz = c->nz; 3362 c->nonzerorowcnt = 0; 3363 c->rmax = 0; 3364 for (k = 0; k < m; k++) { 3365 const PetscInt nn = c->i[k + 1] - c->i[k]; 3366 c->ilen[k] = c->imax[k] = nn; 3367 c->nonzerorowcnt += (PetscInt)!!nn; 3368 c->rmax = PetscMax(c->rmax, nn); 3369 } 3370 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3371 PetscCall(PetscMalloc1(c->nz, &c->a)); 3372 Ccsr->num_entries = c->nz; 3373 3374 C->nonzerostate++; 3375 PetscCall(PetscLayoutSetUp(C->rmap)); 3376 PetscCall(PetscLayoutSetUp(C->cmap)); 3377 Ccusp->nonzerostate = C->nonzerostate; 3378 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3379 C->preallocated = PETSC_TRUE; 3380 C->assembled = PETSC_FALSE; 3381 C->was_assembled = PETSC_FALSE; 3382 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3383 mmdata->reusesym = PETSC_TRUE; 3384 C->offloadmask = PETSC_OFFLOAD_GPU; 3385 } 3386 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3387 
PetscFunctionReturn(PETSC_SUCCESS); 3388 } 3389 3390 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3391 3392 /* handles sparse or dense B */ 3393 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3394 { 3395 Mat_Product *product = mat->product; 3396 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3397 3398 PetscFunctionBegin; 3399 MatCheckProduct(mat, 1); 3400 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3401 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3402 if (product->type == MATPRODUCT_ABC) { 3403 Ciscusp = PETSC_FALSE; 3404 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3405 } 3406 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3407 PetscBool usecpu = PETSC_FALSE; 3408 switch (product->type) { 3409 case MATPRODUCT_AB: 3410 if (product->api_user) { 3411 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3412 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3413 PetscOptionsEnd(); 3414 } else { 3415 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3416 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3417 PetscOptionsEnd(); 3418 } 3419 break; 3420 case MATPRODUCT_AtB: 3421 if (product->api_user) { 3422 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3423 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3424 PetscOptionsEnd(); 3425 } else { 3426 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3427 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3428 PetscOptionsEnd(); 3429 } 3430 break; 3431 case MATPRODUCT_PtAP: 3432 if (product->api_user) { 3433 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3434 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3435 PetscOptionsEnd(); 3436 } else { 3437 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3438 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3439 PetscOptionsEnd(); 3440 } 3441 break; 3442 case MATPRODUCT_RARt: 3443 if (product->api_user) { 3444 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3445 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3446 PetscOptionsEnd(); 3447 } else { 3448 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3449 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3450 PetscOptionsEnd(); 3451 } 3452 break; 3453 case MATPRODUCT_ABC: 3454 if (product->api_user) { 3455 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), 
((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3456 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3457 PetscOptionsEnd(); 3458 } else { 3459 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3460 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3461 PetscOptionsEnd(); 3462 } 3463 break; 3464 default: 3465 break; 3466 } 3467 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3468 } 3469 /* dispatch */ 3470 if (isdense) { 3471 switch (product->type) { 3472 case MATPRODUCT_AB: 3473 case MATPRODUCT_AtB: 3474 case MATPRODUCT_ABt: 3475 case MATPRODUCT_PtAP: 3476 case MATPRODUCT_RARt: 3477 if (product->A->boundtocpu) { 3478 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3479 } else { 3480 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3481 } 3482 break; 3483 case MATPRODUCT_ABC: 3484 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3485 break; 3486 default: 3487 break; 3488 } 3489 } else if (Biscusp && Ciscusp) { 3490 switch (product->type) { 3491 case MATPRODUCT_AB: 3492 case MATPRODUCT_AtB: 3493 case MATPRODUCT_ABt: 3494 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3495 break; 3496 case MATPRODUCT_PtAP: 3497 case MATPRODUCT_RARt: 3498 case MATPRODUCT_ABC: 3499 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3500 break; 3501 default: 3502 break; 3503 } 3504 } else { /* fallback for AIJ */ 3505 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3506 } 3507 PetscFunctionReturn(PETSC_SUCCESS); 3508 } 3509 3510 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3511 { 3512 PetscFunctionBegin; 3513 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3514 PetscFunctionReturn(PETSC_SUCCESS); 3515 } 3516 3517 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3518 { 3519 PetscFunctionBegin; 3520 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3521 PetscFunctionReturn(PETSC_SUCCESS); 3522 } 3523 3524 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3525 { 3526 PetscFunctionBegin; 3527 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3528 PetscFunctionReturn(PETSC_SUCCESS); 3529 } 3530 3531 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3532 { 3533 PetscFunctionBegin; 3534 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3535 PetscFunctionReturn(PETSC_SUCCESS); 3536 } 3537 3538 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3539 { 3540 PetscFunctionBegin; 3541 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3542 PetscFunctionReturn(PETSC_SUCCESS); 3543 } 3544 3545 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3546 { 3547 int i = blockIdx.x * blockDim.x + threadIdx.x; 3548 if (i < n) y[idx[i]] += x[i]; 3549 } 3550 3551 /* z = op(A) x + y. 
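The thin wrappers in this file call this kernel with (trans, herm) = (FALSE, FALSE) for MatMult/MatMultAdd, (TRUE, FALSE) for MatMultTranspose/MatMultTransposeAdd, and (TRUE, TRUE) for MatMultHermitianTranspose/MatMultHermitianTransposeAdd.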
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3552 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3553 { 3554 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3555 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3556 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3557 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3558 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3559 PetscBool compressed; 3560 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3561 PetscInt nx, ny; 3562 #endif 3563 3564 PetscFunctionBegin; 3565 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3566 if (!a->nz) { 3567 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3568 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3569 PetscFunctionReturn(PETSC_SUCCESS); 3570 } 3571 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3572 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3573 if (!trans) { 3574 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3575 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3576 } else { 3577 if (herm || !A->form_explicit_transpose) { 3578 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3579 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3580 } else { 3581 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3582 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3583 } 3584 } 3585 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3586 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3587 3588 try { 3589 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3590 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3591 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3592 3593 PetscCall(PetscLogGpuTimeBegin()); 3594 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3595 /* z = A x + beta y. 3596 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3597 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3598 */ 3599 xptr = xarray; 3600 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3601 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3602 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3603 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3604 allocated to accommodate different uses. So we get the length info directly from mat. 3605 */ 3606 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3607 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3608 nx = mat->num_cols; // since y = Ax 3609 ny = mat->num_rows; 3610 } 3611 #endif 3612 } else { 3613 /* z = A^T x + beta y 3614 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3615 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3616 */ 3617 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3618 dptr = zarray; 3619 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3620 if (compressed) { /* Scatter x to work vector */ 3621 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3622 3623 thrust::for_each( 3624 #if PetscDefined(HAVE_THRUST_ASYNC) 3625 thrust::cuda::par.on(PetscDefaultCudaStream), 3626 #endif 3627 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3628 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3629 } 3630 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3631 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3632 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3633 nx = mat->num_rows; // since y = A^T x 3634 ny = mat->num_cols; 3635 } 3636 #endif 3637 } 3638 3639 /* csr_spmv does y = alpha op(A) x + beta y */ 3640 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3641 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3642 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3643 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 3644 #else 3645 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3646 #endif 3647 3648 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3649 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3650 if (!matDescr) { 3651 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3652 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3653 } 3654 #endif 3655 3656 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3657 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3658 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3659 PetscCallCUSPARSE( 3660 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3661 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3662 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3663 PetscCallCUSPARSE( 3664 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3665 #endif 3666 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3667 } else { 3668 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3669 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3670 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3671 } 3672 3673 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, 
matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3674 #else 3675 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3676 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3677 #endif 3678 } else { 3679 if (cusparsestruct->nrows) { 3680 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3681 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3682 #else 3683 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3684 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3685 #endif 3686 } 3687 } 3688 PetscCall(PetscLogGpuTimeEnd()); 3689 3690 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3691 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3692 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3693 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3694 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3695 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3696 } 3697 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3698 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3699 } 3700 3701 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3702 if (compressed) { 3703 PetscCall(PetscLogGpuTimeBegin()); 3704 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3705 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3706 PetscCall(PetscLogGpuTimeEnd()); 3707 } 3708 } else { 3709 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3710 } 3711 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3712 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3713 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3714 } catch (char *ex) { 3715 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3716 } 3717 if (yy) { 3718 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3719 } else { 3720 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3721 } 3722 PetscFunctionReturn(PETSC_SUCCESS); 3723 } 3724 3725 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3726 { 3727 PetscFunctionBegin; 3728 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3729 PetscFunctionReturn(PETSC_SUCCESS); 3730 } 3731 3732 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3733 { 3734 PetscFunctionBegin; 3735 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3736 PetscFunctionReturn(PETSC_SUCCESS); 3737 } 3738 3739 /*@ 3740 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 3741 (the default parallel PETSc format). 3742 3743 Collective 3744 3745 Input Parameters: 3746 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3747 . m - number of rows 3748 . n - number of columns 3749 . 
nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3750 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3751 3752 Output Parameter: 3753 . A - the matrix 3754 3755 Level: intermediate 3756 3757 Notes: 3758 This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for 3759 calculations. For good matrix assembly performance the user should preallocate the matrix 3760 storage by setting the parameter `nz` (or the array `nnz`). 3761 3762 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3763 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3764 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3765 3766 The AIJ format, also called 3767 compressed row storage, is fully compatible with standard Fortran 3768 storage. That is, the stored row and column indices can begin at 3769 either one (as in Fortran) or zero. 3770 3771 Specify the preallocated storage with either nz or nnz (not both). 3772 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3773 allocation. 3774 3775 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE` 3776 @*/ 3777 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3778 { 3779 PetscFunctionBegin; 3780 PetscCall(MatCreate(comm, A)); 3781 PetscCall(MatSetSizes(*A, m, n, m, n)); 3782 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3783 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3784 PetscFunctionReturn(PETSC_SUCCESS); 3785 } 3786 3787 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3788 { 3789 PetscFunctionBegin; 3790 if (A->factortype == MAT_FACTOR_NONE) { 3791 PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 3792 } else { 3793 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3794 } 3795 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3796 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3797 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3798 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3799 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3800 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3801 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3802 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3803 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3804 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3805 PetscCall(MatDestroy_SeqAIJ(A)); 3806 PetscFunctionReturn(PETSC_SUCCESS); 3807 } 3808 3809 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3810 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3811 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3812 { 3813 PetscFunctionBegin;
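/* duplicate on the host first, then convert the copy in place so it picks up the CUSPARSE data structures and function table */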
3814 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3815 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3816 PetscFunctionReturn(PETSC_SUCCESS); 3817 } 3818 3819 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3820 { 3821 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3822 Mat_SeqAIJCUSPARSE *cy; 3823 Mat_SeqAIJCUSPARSE *cx; 3824 PetscScalar *ay; 3825 const PetscScalar *ax; 3826 CsrMatrix *csry, *csrx; 3827 3828 PetscFunctionBegin; 3829 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3830 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3831 if (X->ops->axpy != Y->ops->axpy) { 3832 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3833 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3834 PetscFunctionReturn(PETSC_SUCCESS); 3835 } 3836 /* if we are here, it means both matrices are bound to GPU */ 3837 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3838 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3839 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3840 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3841 csry = (CsrMatrix *)cy->mat->mat; 3842 csrx = (CsrMatrix *)cx->mat->mat; 3843 /* see if we can turn this into a cublas axpy */ 3844 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3845 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3846 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3847 if (eq) str = SAME_NONZERO_PATTERN; 3848 } 3849 /* spgeam is buggy with one column */ 3850 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3851 3852 if (str == SUBSET_NONZERO_PATTERN) { 3853 PetscScalar b = 1.0; 3854 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3855 size_t bufferSize; 3856 void *buffer; 3857 #endif 3858 3859 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3860 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3861 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3862 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3863 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3864 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3865 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3866 PetscCall(PetscLogGpuTimeBegin()); 3867 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3868 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3869 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3870 PetscCall(PetscLogGpuTimeEnd()); 3871 PetscCallCUDA(cudaFree(buffer)); 3872 #else 3873 PetscCall(PetscLogGpuTimeBegin()); 3874 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, 
csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3875 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3876 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3877 PetscCall(PetscLogGpuTimeEnd()); 3878 #endif 3879 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3880 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3881 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3882 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3883 } else if (str == SAME_NONZERO_PATTERN) { 3884 cublasHandle_t cublasv2handle; 3885 PetscBLASInt one = 1, bnz = 1; 3886 3887 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3888 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3889 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3890 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3891 PetscCall(PetscLogGpuTimeBegin()); 3892 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3893 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3894 PetscCall(PetscLogGpuTimeEnd()); 3895 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3896 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3897 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3898 } else { 3899 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3900 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3901 } 3902 PetscFunctionReturn(PETSC_SUCCESS); 3903 } 3904 3905 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3906 { 3907 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3908 PetscScalar *ay; 3909 cublasHandle_t cublasv2handle; 3910 PetscBLASInt one = 1, bnz = 1; 3911 3912 PetscFunctionBegin; 3913 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3914 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3915 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3916 PetscCall(PetscLogGpuTimeBegin()); 3917 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3918 PetscCall(PetscLogGpuFlops(bnz)); 3919 PetscCall(PetscLogGpuTimeEnd()); 3920 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3921 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3922 PetscFunctionReturn(PETSC_SUCCESS); 3923 } 3924 3925 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3926 { 3927 PetscBool both = PETSC_FALSE; 3928 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3929 3930 PetscFunctionBegin; 3931 if (A->factortype == MAT_FACTOR_NONE) { 3932 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3933 if (spptr->mat) { 3934 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3935 if (matrix->values) { 3936 both = PETSC_TRUE; 3937 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3938 } 3939 } 3940 if (spptr->matTranspose) { 3941 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3942 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3943 } 3944 } 3945 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3946 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3947 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3948 else A->offloadmask = PETSC_OFFLOAD_CPU; 3949 PetscFunctionReturn(PETSC_SUCCESS); 3950 } 3951 3952 static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m) 3953 { 3954 PetscFunctionBegin; 3955 *m = PETSC_MEMTYPE_CUDA; 3956 PetscFunctionReturn(PETSC_SUCCESS); 3957 } 3958 3959 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3960 { 3961 Mat_SeqAIJ *a 
= (Mat_SeqAIJ *)A->data; 3962 3963 PetscFunctionBegin; 3964 if (A->factortype != MAT_FACTOR_NONE) { 3965 A->boundtocpu = flg; 3966 PetscFunctionReturn(PETSC_SUCCESS); 3967 } 3968 if (flg) { 3969 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3970 3971 A->ops->scale = MatScale_SeqAIJ; 3972 A->ops->axpy = MatAXPY_SeqAIJ; 3973 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3974 A->ops->mult = MatMult_SeqAIJ; 3975 A->ops->multadd = MatMultAdd_SeqAIJ; 3976 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3977 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3978 A->ops->multhermitiantranspose = NULL; 3979 A->ops->multhermitiantransposeadd = NULL; 3980 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3981 A->ops->getcurrentmemtype = NULL; 3982 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 3983 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3984 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3985 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3986 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3987 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3988 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3989 } else { 3990 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3991 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3992 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3993 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3994 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3995 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3996 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3997 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3998 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3999 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4000 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4001 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4002 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4003 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4004 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4005 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4006 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4007 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4008 4009 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4010 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4011 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4012 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4013 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4014 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4015 } 4016 A->boundtocpu = flg; 4017 if (flg && 
a->inode.size_csr) { 4018 a->inode.use = PETSC_TRUE; 4019 } else { 4020 a->inode.use = PETSC_FALSE; 4021 } 4022 PetscFunctionReturn(PETSC_SUCCESS); 4023 } 4024 4025 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4026 { 4027 Mat B; 4028 4029 PetscFunctionBegin; 4030 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4031 if (reuse == MAT_INITIAL_MATRIX) { 4032 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4033 } else if (reuse == MAT_REUSE_MATRIX) { 4034 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4035 } 4036 B = *newmat; 4037 4038 PetscCall(PetscFree(B->defaultvectype)); 4039 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4040 4041 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4042 if (B->factortype == MAT_FACTOR_NONE) { 4043 Mat_SeqAIJCUSPARSE *spptr; 4044 PetscCall(PetscNew(&spptr)); 4045 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4046 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4047 spptr->format = MAT_CUSPARSE_CSR; 4048 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4049 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4050 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4051 #else 4052 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4053 #endif 4054 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4055 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4056 #endif 4057 B->spptr = spptr; 4058 } else { 4059 Mat_SeqAIJCUSPARSETriFactors *spptr; 4060 4061 PetscCall(PetscNew(&spptr)); 4062 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4063 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4064 B->spptr = spptr; 4065 } 4066 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4067 } 4068 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4069 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4070 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4071 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4072 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4073 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4074 B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4075 4076 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 4077 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 4078 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4079 #if defined(PETSC_HAVE_HYPRE) 4080 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4081 #endif 4082 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4083 PetscFunctionReturn(PETSC_SUCCESS); 4084 } 4085 4086 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4087 { 4088 PetscFunctionBegin; 4089 PetscCall(MatCreate_SeqAIJ(B)); 4090 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 4091 PetscFunctionReturn(PETSC_SUCCESS); 4092 } 4093 4094 /*MC 4095 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4096 4097 A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in 4098 CSR, ELL, or hybrid (HYB) format. 4099 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
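Example usage, as a minimal sketch (assuming `m`, `n`, and a per-row nonzero estimate `nz` are supplied by the caller):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, m, n, m, n);
   MatSetType(A, MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A, nz, NULL);
   // assemble with MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd(); subsequent MatMult() runs on the GPU
.ve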
4100 4101 Options Database Keys: 4102 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 4103 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 4104 Other options include ell (ellpack) or hyb (hybrid). 4105 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 4106 - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4107 4108 Level: beginner 4109 4110 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4111 M*/ 4112 4113 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4114 { 4115 PetscFunctionBegin; 4116 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 4117 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 4118 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 4119 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4120 PetscFunctionReturn(PETSC_SUCCESS); 4121 } 4122 4123 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4124 { 4125 Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4126 4127 PetscFunctionBegin; 4128 if (cusp) { 4129 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 4130 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4131 delete cusp->workVector; 4132 delete cusp->rowoffsets_gpu; 4133 delete cusp->csr2csc_i; 4134 delete cusp->coords; 4135 if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 4136 PetscCall(PetscFree(mat->spptr)); 4137 } 4138 PetscFunctionReturn(PETSC_SUCCESS); 4139 } 4140 4141 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4142 { 4143 PetscFunctionBegin; 4144 if (*mat) { 4145 delete (*mat)->values; 4146 delete (*mat)->column_indices; 4147 delete (*mat)->row_offsets; 4148 delete *mat; 4149 *mat = 0; 4150 } 4151 PetscFunctionReturn(PETSC_SUCCESS); 4152 } 4153 4154 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4155 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4156 { 4157 PetscFunctionBegin; 4158 if (*trifactor) { 4159 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4160 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4161 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4162 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4163 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4164 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4165 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4166 #endif 4167 PetscCall(PetscFree(*trifactor)); 4168 } 4169 PetscFunctionReturn(PETSC_SUCCESS); 4170 } 4171 #endif 4172 4173 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4174 { 4175 CsrMatrix *mat; 4176 4177 PetscFunctionBegin; 4178 if 
(*matstruct) { 4179 if ((*matstruct)->mat) { 4180 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4181 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4182 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4183 #else 4184 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4185 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4186 #endif 4187 } else { 4188 mat = (CsrMatrix *)(*matstruct)->mat; 4189 PetscCall(CsrMatrix_Destroy(&mat)); 4190 } 4191 } 4192 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4193 delete (*matstruct)->cprowIndices; 4194 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4195 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4196 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4197 4198 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4199 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4200 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4201 4202 for (int i = 0; i < 3; i++) { 4203 if (mdata->cuSpMV[i].initialized) { 4204 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4205 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4206 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4207 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4208 if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4209 if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4210 #endif 4211 } 4212 } 4213 #endif 4214 delete *matstruct; 4215 *matstruct = NULL; 4216 } 4217 PetscFunctionReturn(PETSC_SUCCESS); 4218 } 4219 4220 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4221 { 4222 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4223 4224 PetscFunctionBegin; 4225 if (fs) { 4226 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4227 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4228 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4229 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4230 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4231 delete fs->workVector; 4232 fs->workVector = NULL; 4233 #endif 4234 delete fs->rpermIndices; 4235 delete fs->cpermIndices; 4236 fs->rpermIndices = NULL; 4237 fs->cpermIndices = NULL; 4238 fs->init_dev_prop = PETSC_FALSE; 4239 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4240 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4241 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4242 PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 4243 PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4244 PetscCallCUDA(cudaFree(fs->csrVal)); 4245 PetscCallCUDA(cudaFree(fs->diag)); 4246 PetscCallCUDA(cudaFree(fs->X)); 4247 PetscCallCUDA(cudaFree(fs->Y)); 4248 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4249 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4250 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4251 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4252 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4253 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4254 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4255 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4256 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4257
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4258 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4259 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4260 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4261 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4262 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4263 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4264 PetscCall(PetscFree(fs->csrRowPtr_h)); 4265 PetscCall(PetscFree(fs->csrVal_h)); 4266 PetscCall(PetscFree(fs->diag_h)); 4267 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4268 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4269 #endif 4270 } 4271 PetscFunctionReturn(PETSC_SUCCESS); 4272 } 4273 4274 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4275 { 4276 PetscFunctionBegin; 4277 if (*trifactors) { 4278 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4279 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 4280 PetscCall(PetscFree(*trifactors)); 4281 } 4282 PetscFunctionReturn(PETSC_SUCCESS); 4283 } 4284 4285 struct IJCompare { 4286 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4287 { 4288 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4289 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4290 return false; 4291 } 4292 }; 4293 4294 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4295 { 4296 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4297 4298 PetscFunctionBegin; 4299 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4300 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4301 if (destroy) { 4302 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4303 delete cusp->csr2csc_i; 4304 cusp->csr2csc_i = NULL; 4305 } 4306 A->transupdated = PETSC_FALSE; 4307 PetscFunctionReturn(PETSC_SUCCESS); 4308 } 4309 4310 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4311 { 4312 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 4313 4314 PetscFunctionBegin; 4315 PetscCallCUDA(cudaFree(coo->perm)); 4316 PetscCallCUDA(cudaFree(coo->jmap)); 4317 PetscCall(PetscFree(coo)); 4318 PetscFunctionReturn(PETSC_SUCCESS); 4319 } 4320 4321 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4322 { 4323 PetscBool dev_ij = PETSC_FALSE; 4324 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4325 PetscInt *i, *j; 4326 PetscContainer container_h; 4327 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4328 4329 PetscFunctionBegin; 4330 PetscCall(PetscGetMemType(coo_i, &mtype)); 4331 if (PetscMemTypeDevice(mtype)) { 4332 dev_ij = PETSC_TRUE; 4333 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4334 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4335 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4336 } else { 4337 i = coo_i; 4338 j = coo_j; 4339 } 4340 4341 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4342 if (dev_ij) PetscCall(PetscFree2(i, j)); 4343 mat->offloadmask = PETSC_OFFLOAD_CPU; 4344 // Create the GPU memory 4345 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4346 4347 // Copy the COO struct to device 4348 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 
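/* the host-side COO struct was attached by MatSetPreallocationCOO_SeqAIJ() above; mirror its jmap[] and perm[] arrays on the device below */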
4349 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4350 PetscCall(PetscMalloc1(1, &coo_d)); 4351 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4352 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4353 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4354 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4355 PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4356 4357 // Put the COO struct in a container and then attach that to the matrix 4358 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 4359 PetscFunctionReturn(PETSC_SUCCESS); 4360 } 4361 4362 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4363 { 4364 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4365 const PetscCount grid_size = gridDim.x * blockDim.x; 4366 for (; i < nnz; i += grid_size) { 4367 PetscScalar sum = 0.0; 4368 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4369 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4370 } 4371 } 4372 4373 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4374 { 4375 Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4376 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4377 PetscCount Annz = seq->nz; 4378 PetscMemType memtype; 4379 const PetscScalar *v1 = v; 4380 PetscScalar *Aa; 4381 PetscContainer container; 4382 MatCOOStruct_SeqAIJ *coo; 4383 4384 PetscFunctionBegin; 4385 if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4386 4387 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 4388 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 4389 4390 PetscCall(PetscGetMemType(v, &memtype)); 4391 if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */ 4392 PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 4393 PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4394 } 4395 4396 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 4397 else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4398 4399 PetscCall(PetscLogGpuTimeBegin()); 4400 if (Annz) { 4401 MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa); 4402 PetscCallCUDA(cudaPeekAtLastError()); 4403 } 4404 PetscCall(PetscLogGpuTimeEnd()); 4405 4406 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4407 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4408 4409 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4410 PetscFunctionReturn(PETSC_SUCCESS); 4411 } 4412 4413 /*@C 4414 MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need the full row offsets */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}
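
/*
  A minimal usage sketch (not part of this file): inspecting the device CSR index
  arrays of an assembled MATSEQAIJCUSPARSE matrix A. The returned pointers are device
  pointers and must not be dereferenced on the host.

    const int *di, *dj;

    PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_TRUE, &di, &dj));
    // di/dj may be passed to a CUDA kernel or a cuSPARSE routine here
    PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_TRUE, &di, &dj));
*/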
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if the up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
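
/*
  A minimal usage sketch (not part of this file): reducing the stored nonzeros of an
  assembled matrix A on the device with thrust, through the read-only accessor above.
  MatGetInfo() is used here only to obtain the number of stored nonzeros; all other
  names come from this file or PETSc's public API.

    const PetscScalar *da;
    MatInfo            info;
    PetscScalar        sum;

    PetscCall(MatGetInfo(A, MAT_LOCAL, &info));
    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da));
    sum = thrust::reduce(thrust::device, thrust::device_pointer_cast(da), thrust::device_pointer_cast(da) + (PetscInt)info.nz_used, (PetscScalar)0.0);
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da));
*/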
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if the up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
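
/*
  A minimal usage sketch (not part of this file): scaling every stored nonzero of A in
  place on the device through the read-write accessor above, assuming a real
  PetscScalar. The accessor marks the GPU copy as the valid one and invalidates any
  cached transpose, so no extra host-device traffic is needed afterwards.

    PetscScalar *da;
    MatInfo      info;

    PetscCall(MatGetInfo(A, MAT_LOCAL, &info));
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &da));
    {
      const thrust::device_ptr<PetscScalar> dp = thrust::device_pointer_cast(da);
      thrust::transform(thrust::device, dp, dp + (PetscInt)info.nz_used, dp, thrust::placeholders::_1 * 2.0);
    }
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &da));
*/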
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
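
/*
  A minimal sketch (not part of this file) of the pattern MatSeqAIJCUSPARSEMergeMats()
  uses below: two streams of (row, col, val, tag) tuples, each already sorted by
  (row, col), are combined with thrust::merge under IJCompare4, while Shift rebases the
  column indices of the second stream. The tag component (1 for the first stream, 0 for
  the second) records the provenance of each merged entry. All vector names, nA, nB,
  and ncolA are hypothetical.

    auto Az = thrust::make_zip_iterator(thrust::make_tuple(Ai.begin(), Aj.begin(), Av.begin(), thrust::make_constant_iterator(1)));
    auto Bz = thrust::make_zip_iterator(thrust::make_tuple(Bi.begin(), thrust::make_transform_iterator(Bj.begin(), Shift(ncolA)), Bv.begin(), thrust::make_constant_iterator(0)));
    auto Cz = thrust::make_zip_iterator(thrust::make_tuple(Ci.begin(), Cj.begin(), Cv.begin(), Ctag.begin()));
    thrust::merge(thrust::device, Az, Az + nA, Bz, Bz + nB, Cz, IJCompare4());
*/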
/* Merges two SeqAIJCUSPARSE matrices A and B by concatenating each row of A with the
   corresponding row of B, i.e. C = [A, B] (the [A';B']' operation in MATLAB notation) */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    c->compressedrow.use = PETSC_FALSE;
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows = m;
    Ccusp->mat = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows = m;
    Ccsr->num_cols = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix *)Acusp->mat->mat;
    Bcsr = (CsrMatrix *)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need the full row offsets */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need the full row offsets */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1 = Ccusp->coords->begin();
      auto p2 = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 // Errors on SUMMIT with CUDA 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
#if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
#else
      auto pred = cuda::std::identity();
#endif
      /* wPerm marks merged entries coming from A with 1 and those from B with 0; copy_if
         gathers the positions of A's entries into the first Annz slots of coords and
         remove_copy_if gathers the positions of B's entries into the rest, together
         emulating the partition_copy disabled above */
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have explicit transposes, generate the transpose of C too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-bit to 64-bit conversion on the GPU, then copy to the host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter the values of A and B into C through the coords permutation computed at creation time */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* the result was copied back to the host */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
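
/*
  A minimal usage sketch (not part of this file) of MatSeqAIJCUSPARSEMergeMats()
  defined above: given two assembled MATSEQAIJCUSPARSE matrices A (m x nA) and
  B (m x nB), build C = [A, B] of size m x (nA + nB), then refresh the numerical
  values after A or B changes.

    Mat C;

    PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // builds the structure and values
    // ... update the values of A and/or B ...
    PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // re-scatters the values only
    PetscCall(MatDestroy(&C));
*/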