/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
  #include <cuda/std/functional>
#endif

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
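/*
   Sketch of the PetscOptionsEnum() convention assumed by the tables above: the parsed value is
   the 0-based index of the matched string, so e.g. "-mat_cusparse_spmv_alg csrmv_alg1" yields 2,
   which must equal the numeric value of the corresponding cuSPARSE enum (consistency is checked
   in MatSetFromOptions_SeqAIJCUSPARSE() below).
*/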
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
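/*
   Usage sketch (hypothetical caller): switch the SpMV storage format of an assembled
   MATSEQAIJCUSPARSE matrix to ELL; the same can be done at runtime with
   -mat_cusparse_mult_storage_format ell.

     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, nz, NULL, &A));
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() ...
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
*/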
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use of the CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
   Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
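/*
   Usage sketch (hypothetical caller): keep the (I)LU triangular solves on the CPU,
   equivalent to the runtime option -mat_cusparse_use_cpu_solve.

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));
*/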
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
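/*
   Usage sketch (hypothetical command line): pick the storage format and cuSPARSE
   algorithms at runtime for a program that creates this matrix type.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format csr \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_use_cpu_solve
*/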
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
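    // cuSPARSE SpSV workflow assumed above (sketch): cusparseSpSV_createDescr() ->
    // cusparseSpSV_bufferSize() -> cusparseSpSV_analysis() once per sparsity pattern,
    // then cusparseSpSV_solve() per right-hand side; on CUDA >= 12.1.1,
    // cusparseSpSV_updateMatrix() below refreshes values without a new analysis.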
#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
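/*
   Host layout sketch for the merged factor M built above (assuming the SeqAIJ factored-matrix
   conventions): row i of M is
     [ strictly-lower entries of L | d_ii | strictly-upper entries of U ],
   where d_ii is recovered as 1.0/Aa[Adiag[i]] because MatLUFactorNumeric_SeqAIJ() stores the
   inverted diagonal; hence Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]).
*/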
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
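/*
   Row layout sketch for the legacy lower factor built above: each row holds the strictly-lower
   entries of L followed by an explicit unit diagonal,
     AjLo row i = [ L(i, 0..i-1) | i ],   AALo row i = [ L values | 1.0 ],
   matching nzLower = n + ai[n] - ai[1] (n unit-diagonal entries plus the strictly-lower ones).
*/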
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;
          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
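/*
   Usage sketch (hypothetical command line): ILU(0)-preconditioned solve exercising the
   analysis/copy path above; add -mat_cusparse_use_cpu_solve to keep the triangular
   solves on the CPU.

     ./app -mat_type seqaijcusparse -pc_type ilu
*/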
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]] = i;                                                  // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
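/*
   Math sketch for the solve below (assuming the SeqAIJ ICC conventions noted above): with the
   factorization A ~ Ut * diag(d) * U and fs->diag holding 1/d, the three steps are
     y = Ut^{-1} b,   z = y .* (1/d),   x = U^{-1} z.
*/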
// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
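/*
   Usage sketch (hypothetical command line): ICC(0)-preconditioned solve exercising the
   path above; the numeric factorization runs on the CPU and the triangular solves run
   on the GPU.

     ./app -mat_type seqaijcusparse -pc_type icc
*/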
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
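  /*
     Note on the conversion above (sketch): converting the CSR factor to CSC with
     CUSPARSE_ACTION_NUMERIC yields exactly the CSR representation of its transpose, which is
     why the transposed-factor solves can then use CUSPARSE_OPERATION_NON_TRANSPOSE.
  */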
PetscCall(PetscNew(&upTriFactorT)); 1131 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1132 1133 /* set the matrix descriptors of the upper triangular factor */ 1134 matrixType = cusparseGetMatType(upTriFactor->descr); 1135 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1136 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1137 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1138 1139 /* Create the matrix description */ 1140 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1141 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1142 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1143 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1144 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1145 1146 /* set the operation */ 1147 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1148 1149 /* allocate GPU space for the CSC of the upper triangular factor*/ 1150 upTriFactorT->csrMat = new CsrMatrix; 1151 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1152 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1153 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1154 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1155 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1156 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1157 1158 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1159 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1160 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1161 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1163 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1164 #endif 1165 1166 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1167 { 1168 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
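    // The call below goes through the cusparse_csr2csc macro because the underlying cuSPARSE API changed:
    // CUDA >= 11 provides cusparseCsr2cscEx2(), which takes the scalar type, an algorithm choice, and the
    // explicitly sized buffer allocated above, while older toolkits use the legacy csr2csc entry points
    // with no buffer argument; hence the preprocessor split in the argument list.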
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

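  /* The legacy csrsv2 interface used here follows a three-phase pattern; a minimal sketch with
     hypothetical names (desc, info, op, n, nnz, alpha, and the device arrays are placeholders):

       cusparseXcsrsv_buffsize(handle, op, n, nnz, desc, val, rowptr, colidx, info, &size);              // query workspace size
       cudaMalloc(&buf, size);                                                                           // allocate it once
       cusparseXcsrsv_analysis(handle, op, n, nnz, desc, val, rowptr, colidx, info, policy, buf);        // level/dependency analysis
       cusparseXcsrsv_solve(handle, op, n, nnz, &alpha, desc, val, rowptr, colidx, info, b, x, policy, buf); // repeated solves

     The analysis cost is paid once per factor; only the solve phase runs inside MatSolve(). */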
  /* perform the solve analysis; note that this duplicates the lower-factor code above almost verbatim
     and would be cleaner factored into a shared helper function */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
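      /* The assign() below is a host-to-device copy: a->i lives in host memory while rowoffsets_gpu is a
         thrust device vector, so thrust issues the equivalent of a cudaMemcpy. A minimal sketch of the
         idiom, with hypothetical values:

           int              hptr[] = {0, 2, 5}; // host CSR row offsets: 2 rows, 5 nonzeros
           THRUSTINTARRAY32 dptr(3);            // device vector of 3 ints
           dptr.assign(hptr, hptr + 3);         // copies all 3 offsets to the GPU
      */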
cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1254 1255 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1256 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1257 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1258 indexBase, cusparse_scalartype); 1259 PetscCallCUSPARSE(stat); 1260 #else 1261 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1262 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1263 1264 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1265 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1266 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1267 */ 1268 if (matrixT->num_entries) { 1269 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1270 PetscCallCUSPARSE(stat); 1271 1272 } else { 1273 matstructT->matDescr = NULL; 1274 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1275 } 1276 #endif 1277 #endif 1278 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1279 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1280 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1281 #else 1282 CsrMatrix *temp = new CsrMatrix; 1283 CsrMatrix *tempT = new CsrMatrix; 1284 /* First convert HYB to CSR */ 1285 temp->num_rows = A->rmap->n; 1286 temp->num_cols = A->cmap->n; 1287 temp->num_entries = a->nz; 1288 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1289 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1290 temp->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1293 PetscCallCUSPARSE(stat); 1294 1295 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1296 tempT->num_rows = A->rmap->n; 1297 tempT->num_cols = A->cmap->n; 1298 tempT->num_entries = a->nz; 1299 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1300 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1301 tempT->values = new THRUSTARRAY(a->nz); 1302 1303 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1304 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1305 PetscCallCUSPARSE(stat); 1306 1307 /* Last, convert CSC to HYB */ 1308 cusparseHybMat_t hybMat; 1309 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1310 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
           I checked every parameter and they were all valid; I have no clue why cusparse complains.
1367 1368 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1369 should be filled with indexBase. So I just take a shortcut here. 1370 */ 1371 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1372 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1373 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1374 PetscCallCUSPARSE(stat); 1375 #else 1376 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1377 PetscCallCUSPARSE(stat); 1378 #endif 1379 } else { 1380 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1381 } 1382 1383 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1384 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1385 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1386 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1387 #endif 1388 } 1389 PetscCallThrust( 1390 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1391 } 1392 PetscCall(PetscLogGpuTimeEnd()); 1393 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1394 /* the compressed row indices is not used for matTranspose */ 1395 matstructT->cprowIndices = NULL; 1396 /* assign the pointer */ 1397 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1398 A->transupdated = PETSC_TRUE; 1399 PetscFunctionReturn(PETSC_SUCCESS); 1400 } 1401 1402 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1403 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1404 { 1405 const PetscScalar *barray; 1406 PetscScalar *xarray; 1407 thrust::device_ptr<const PetscScalar> bGPU; 1408 thrust::device_ptr<PetscScalar> xGPU; 1409 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1410 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1411 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1412 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1413 PetscInt m = A->rmap->n; 1414 1415 PetscFunctionBegin; 1416 PetscCall(PetscLogGpuTimeBegin()); 1417 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1418 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1419 xGPU = thrust::device_pointer_cast(xarray); 1420 bGPU = thrust::device_pointer_cast(barray); 1421 1422 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1423 if (fs->rpermIndices) { 1424 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1425 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1426 } else { 1427 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1428 } 1429 1430 // Solve L Y = X 1431 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1432 // Note that cusparseSpSV_solve() secretly uses 
the external buffer used in cusparseSpSV_analysis()! 1433 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1434 1435 // Solve U X = Y 1436 if (fs->cpermIndices) { 1437 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1438 } else { 1439 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1440 } 1441 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1442 1443 // Reorder X with the column permutation if needed, and put the result back to x 1444 if (fs->cpermIndices) { 1445 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1446 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1447 } 1448 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1449 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1450 PetscCall(PetscLogGpuTimeEnd()); 1451 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1452 PetscFunctionReturn(PETSC_SUCCESS); 1453 } 1454 1455 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1456 { 1457 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1458 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1459 const PetscScalar *barray; 1460 PetscScalar *xarray; 1461 thrust::device_ptr<const PetscScalar> bGPU; 1462 thrust::device_ptr<PetscScalar> xGPU; 1463 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1464 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1465 PetscInt m = A->rmap->n; 1466 1467 PetscFunctionBegin; 1468 PetscCall(PetscLogGpuTimeBegin()); 1469 if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1470 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1471 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1472 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1473 1474 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1475 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1476 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1477 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1478 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1479 } 1480 1481 if (!fs->updatedTransposeSpSVAnalysis) { 1482 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1483 1484 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1485 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1486 } 1487 1488 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1489 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1490 xGPU = thrust::device_pointer_cast(xarray); 1491 bGPU = thrust::device_pointer_cast(barray); 1492 1493 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1494 if (fs->rpermIndices) { 1495 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1496 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1497 } else { 1498 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1499 } 1500 1501 // Solve Ut Y = X 1502 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1503 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1504 1505 // Solve Lt X = Y 1506 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1507 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1508 } else { 1509 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1510 } 1511 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1512 1513 // Reorder X with the column permutation if needed, and put the result back to x 1514 if (fs->cpermIndices) { 1515 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1516 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1517 } 1518 1519 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1520 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1521 PetscCall(PetscLogGpuTimeEnd()); 1522 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1523 PetscFunctionReturn(PETSC_SUCCESS); 1524 } 1525 #else 1526 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                               n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation; this cannot be done in place */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution.
*/ 1569 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1570 1571 /* restore */ 1572 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1573 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1574 PetscCall(PetscLogGpuTimeEnd()); 1575 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1576 PetscFunctionReturn(PETSC_SUCCESS); 1577 } 1578 1579 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1580 { 1581 const PetscScalar *barray; 1582 PetscScalar *xarray; 1583 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1584 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1585 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1586 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1587 1588 PetscFunctionBegin; 1589 /* Analyze the matrix and create the transpose ... on the fly */ 1590 if (!loTriFactorT && !upTriFactorT) { 1591 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1592 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1593 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1594 } 1595 1596 /* Get the GPU pointers */ 1597 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1598 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1599 1600 PetscCall(PetscLogGpuTimeBegin()); 1601 /* First, solve U */ 1602 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1603 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1604 1605 /* Then, solve L */ 1606 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1607 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1608 1609 /* restore */ 1610 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1611 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1612 PetscCall(PetscLogGpuTimeEnd()); 1613 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1614 PetscFunctionReturn(PETSC_SUCCESS); 1615 } 1616 1617 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1618 { 1619 const PetscScalar *barray; 1620 PetscScalar *xarray; 1621 thrust::device_ptr<const PetscScalar> bGPU; 1622 thrust::device_ptr<PetscScalar> xGPU; 1623 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1624 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1625 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1626 THRUSTARRAY *tempGPU = (THRUSTARRAY 
*)cusparseTriFactors->workVector; 1627 1628 PetscFunctionBegin; 1629 /* Get the GPU pointers */ 1630 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1631 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1632 xGPU = thrust::device_pointer_cast(xarray); 1633 bGPU = thrust::device_pointer_cast(barray); 1634 1635 PetscCall(PetscLogGpuTimeBegin()); 1636 /* First, reorder with the row permutation */ 1637 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1638 1639 /* Next, solve L */ 1640 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1641 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1642 1643 /* Then, solve U */ 1644 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1645 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1646 1647 /* Last, reorder with the column permutation */ 1648 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1649 1650 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1651 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1652 PetscCall(PetscLogGpuTimeEnd()); 1653 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1658 { 1659 const PetscScalar *barray; 1660 PetscScalar *xarray; 1661 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1662 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1663 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1664 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1665 1666 PetscFunctionBegin; 1667 /* Get the GPU pointers */ 1668 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1669 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1670 1671 PetscCall(PetscLogGpuTimeBegin()); 1672 /* First, solve L */ 1673 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1674 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1675 1676 /* Next, solve U */ 1677 
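  /* Rough flop accounting for the pair of triangular solves in these routines: with a unit-diagonal L,
     forward substitution costs one multiply and one add per stored off-diagonal entry, while back
     substitution with U additionally costs one division per row. With nnz the total number of stored
     factor entries and n the number of rows, this sums to roughly 2*nnz - n, which is the figure
     passed to PetscLogGpuFlops() below. */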
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1678 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1679 1680 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1681 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1682 PetscCall(PetscLogGpuTimeEnd()); 1683 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1684 PetscFunctionReturn(PETSC_SUCCESS); 1685 } 1686 #endif 1687 1688 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1689 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1690 { 1691 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1692 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1693 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1694 CsrMatrix *Acsr; 1695 PetscInt m, nz; 1696 PetscBool flg; 1697 1698 PetscFunctionBegin; 1699 if (PetscDefined(USE_DEBUG)) { 1700 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1701 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1702 } 1703 1704 /* Copy A's value to fact */ 1705 m = fact->rmap->n; 1706 nz = aij->nz; 1707 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1708 Acsr = (CsrMatrix *)Acusp->mat->mat; 1709 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1710 1711 PetscCall(PetscLogGpuTimeBegin()); 1712 /* Factorize fact inplace */ 1713 if (m) 1714 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1715 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1716 if (PetscDefined(USE_DEBUG)) { 1717 int numerical_zero; 1718 cusparseStatus_t status; 1719 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1720 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1721 } 1722 1723 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1724 if (fs->updatedSpSVAnalysis) { 1725 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1726 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1727 } else 1728 #endif 1729 { 1730 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1731 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1732 */ 1733 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1734 1735 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1736 1737 fs->updatedSpSVAnalysis = PETSC_TRUE; 1738 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1739 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1740 } 1741 1742 fact->offloadmask = PETSC_OFFLOAD_GPU; 1743 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1744 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1745 fact->ops->matsolve = NULL; 1746 fact->ops->matsolvetranspose = NULL; 1747 PetscCall(PetscLogGpuTimeEnd()); 1748 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1749 PetscFunctionReturn(PETSC_SUCCESS); 1750 } 1751 1752 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1753 { 1754 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1755 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1756 PetscInt m, nz; 1757 1758 PetscFunctionBegin; 1759 if (PetscDefined(USE_DEBUG)) { 1760 PetscInt i; 1761 PetscBool flg, missing; 1762 1763 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1764 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1765 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1766 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1767 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1768 } 1769 1770 /* Free the old stale stuff */ 1771 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1772 1773 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1774 but they will not be used. Allocate them just for easy debugging. 1775 */ 1776 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1777 1778 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1779 fact->factortype = MAT_FACTOR_ILU; 1780 fact->info.factor_mallocs = 0; 1781 fact->info.fill_ratio_given = info->fill; 1782 fact->info.fill_ratio_needed = 1.0; 1783 1784 aij->row = NULL; 1785 aij->col = NULL; 1786 1787 /* ====================================================================== */ 1788 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1789 /* We'll do in-place factorization on fact */ 1790 /* ====================================================================== */ 1791 const int *Ai, *Aj; 1792 1793 m = fact->rmap->n; 1794 nz = aij->nz; 1795 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1797 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1798 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1799 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1800 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1802 1803 /* ====================================================================== */ 1804 /* Create descriptors for M, L, U */ 1805 /* ====================================================================== */ 1806 cusparseFillMode_t fillMode; 1807 cusparseDiagType_t diagType; 1808 1809 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1810 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1811 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1812 1813 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1814 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1815 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1816 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1817 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1818 */ 1819 fillMode = CUSPARSE_FILL_MODE_LOWER; 1820 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1821 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1822 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1823 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1824 1825 fillMode = CUSPARSE_FILL_MODE_UPPER; 1826 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1827 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1829 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1830 1831 /* ========================================================================= */ 1832 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1833 /* ========================================================================= */ 1834 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1835 if (m) 1836 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1837 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1838 1839 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1840 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1841 1842 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1843 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1844 1845 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1846 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can alias either of them.
     To save memory, we let factBuffer_M alias the bigger of spsvBuffer_L/U.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left of the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the
           nonzeros to the right of, and including, the eliminated one, at the cost of one multiplication
           and one addition per updated entry.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1968 */ 1969 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1970 if (PetscDefined(USE_DEBUG)) { 1971 int numerical_zero; 1972 cusparseStatus_t status; 1973 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1974 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1975 } 1976 1977 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1978 if (fs->updatedSpSVAnalysis) { 1979 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1980 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1981 } else 1982 #endif 1983 { 1984 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1985 1986 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1987 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1988 */ 1989 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1990 fs->updatedSpSVAnalysis = PETSC_TRUE; 1991 } 1992 1993 fact->offloadmask = PETSC_OFFLOAD_GPU; 1994 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1995 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1996 fact->ops->matsolve = NULL; 1997 fact->ops->matsolvetranspose = NULL; 1998 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1999 PetscFunctionReturn(PETSC_SUCCESS); 2000 } 2001 2002 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2003 { 2004 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2005 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2006 PetscInt m, nz; 2007 2008 PetscFunctionBegin; 2009 if (PetscDefined(USE_DEBUG)) { 2010 PetscInt i; 2011 PetscBool flg, missing; 2012 2013 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2014 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2015 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2016 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2017 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2018 } 2019 2020 /* Free the old stale stuff */ 2021 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2022 2023 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2024 but they will not be used. Allocate them just for easy debugging. 
2025 */ 2026 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2027 2028 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2029 fact->factortype = MAT_FACTOR_ICC; 2030 fact->info.factor_mallocs = 0; 2031 fact->info.fill_ratio_given = info->fill; 2032 fact->info.fill_ratio_needed = 1.0; 2033 2034 aij->row = NULL; 2035 aij->col = NULL; 2036 2037 /* ====================================================================== */ 2038 /* Copy A's i, j to fact and also allocate the value array of fact. */ 2039 /* We'll do in-place factorization on fact */ 2040 /* ====================================================================== */ 2041 const int *Ai, *Aj; 2042 2043 m = fact->rmap->n; 2044 nz = aij->nz; 2045 2046 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2047 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2048 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2049 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2050 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2051 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2052 2053 /* ====================================================================== */ 2054 /* Create mat descriptors for M, L */ 2055 /* ====================================================================== */ 2056 cusparseFillMode_t fillMode; 2057 cusparseDiagType_t diagType; 2058 2059 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2060 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2061 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2062 2063 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2064 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2065 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2066 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2067 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share memory with one of the solve buffers.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }
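
  /* Sizing note (summarizing the branch above): a single allocation of size
     max(spsvBufferSize_X, factBufferSize_M) backs both the csric02 factorization and the larger of
     the two triangular-solve buffers, while the smaller SpSV buffer gets its own allocation; this
     mirrors the ILU0 path referenced in the comment above. */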

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We eliminate the nonzeros left of the diagonal one by one. Assume that each elimination
           updates the nonzeros to the right of, and including, the eliminated one, costing one
           multiplication and one addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
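
  /* Worked example of the rough model above (an estimate, not an exact count): a row with
     nzRow = 5 nonzeros is assumed to have nzLeft = (5 - 1) / 2 = 2 entries left of the diagonal,
     contributing nzLeft * (2 * nzRow - nzLeft + 1) = 2 * (10 - 2 + 1) = 18 flops. */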
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported algorithms are ILU(k) and ICC(k).
  Typically, deeper factorizations (larger k) result in poorer performance in the triangular solves.
  Full LU and Cholesky decompositions can be solved through the cuSPARSE triangular solve algorithm.
  However, the performance can be quite poor and thus these algorithms are not recommended.
  This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
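
/* Typical selection of this solver, as a sketch (assuming an existing PC object pc from a KSP/PC setup):

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));

   or equivalently from the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse
*/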
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
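
/* These functors act on tuples produced by thrust zip iterators. A minimal usage sketch, with
   hypothetical device vectors x and y (not part of this file):

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
                      VecCUDAPlusEquals());

   computes y[i] += x[i] elementwise; VecCUDAEquals and VecCUDAEqualsReverse copy the first tuple
   element into the second, or vice versa. */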
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar   *Bt;
  Mat            X;
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t mmBufferSize;
  void  *mmBuffer;
  void  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU.
     Instead of silently accepting a wrong answer, we prefer to raise an error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found matADescr could not be reused; it could be a cusparse bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage, see the note below */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
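  /* Note on the intermediate X, as inferred from the numeric phase above: for PtAP (B plays the role
     of P) we first form X = A*P on the GPU and then C = P^T*X with a dense-dense multiply; for RARt
     (B plays R) we form X = A*R^T and then C = R*X. */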
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;
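
  /* Reminder on compressed row storage (used below): only the nonempty rows are kept;
     compressedrow.nrows is the number of such rows, compressedrow.i holds their row offsets, and
     compressedrow.rindex maps each stored row back to its row index in the full matrix. */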
  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
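
  /* Flop model for the counts below (a rough estimate): for C = A*B, each nonzero a(i,j) multiplies
     row j of B and accumulates into row i of C, costing 2*nnz(B(j,:)) flops; for C = A^t*B, the anzi
     entries of row i of A each combine with the bnzi entries of B(i,:), costing 2*anzi*bnzi flops. */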
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3259 PetscCallCUSPARSE(stat); 3260 3261 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3262 PetscCallCUSPARSE(stat); 3263 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3264 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3265 PetscCallCUSPARSE(stat); 3266 PetscCallCUDA(cudaFree(dBuffer3)); 3267 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3268 PetscCallCUSPARSE(stat); 3269 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3270 } 3271 #else 3272 size_t bufSize2; 3273 /* ask bufferSize bytes for external memory */ 3274 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 3275 PetscCallCUSPARSE(stat); 3276 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3277 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3278 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3279 PetscCallCUSPARSE(stat); 3280 /* ask again for bufferSize bytes of external memory */ 3281 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3282 PetscCallCUSPARSE(stat); 3283 /* Neither the CUSPARSE documentation nor the API is clear here: 3284 we need both buffers to perform the operations properly! 3285 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 3286 it only appears in the workEstimation calls, but it seems to be needed in compute, so probably its address 3287 is stored in the descriptor! What a messy API... 
*/ 3288 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3289 /* compute the intermediate product of A * B */ 3290 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3291 PetscCallCUSPARSE(stat); 3292 /* get matrix C non-zero entries C_nnz1 */ 3293 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3294 c->nz = (PetscInt)C_nnz1; 3295 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3296 mmdata->mmBufferSize / 1024)); 3297 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3298 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3299 Ccsr->values = new THRUSTARRAY(c->nz); 3300 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3301 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3302 PetscCallCUSPARSE(stat); 3303 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3304 PetscCallCUSPARSE(stat); 3305 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3306 #else 3307 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3308 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3309 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3310 PetscCallCUSPARSE(stat); 3311 c->nz = cnz; 3312 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3313 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3314 Ccsr->values = new THRUSTARRAY(c->nz); 3315 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3316 3317 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3318 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3319 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 3320 D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! 
*/ 3321 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3322 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3323 PetscCallCUSPARSE(stat); 3324 #endif 3325 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3326 PetscCall(PetscLogGpuTimeEnd()); 3327 finalizesym: 3328 c->free_a = PETSC_TRUE; 3329 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3330 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3331 c->free_ij = PETSC_TRUE; 3332 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3333 PetscInt *d_i = c->i; 3334 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3335 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3336 ii = *Ccsr->row_offsets; 3337 jj = *Ccsr->column_indices; 3338 if (ciscompressed) d_i = c->compressedrow.i; 3339 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3340 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3341 } else { 3342 PetscInt *d_i = c->i; 3343 if (ciscompressed) d_i = c->compressedrow.i; 3344 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3345 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3346 } 3347 if (ciscompressed) { /* need to expand host row offsets */ 3348 PetscInt r = 0; 3349 c->i[0] = 0; 3350 for (k = 0; k < c->compressedrow.nrows; k++) { 3351 const PetscInt next = c->compressedrow.rindex[k]; 3352 const PetscInt old = c->compressedrow.i[k]; 3353 for (; r < next; r++) c->i[r + 1] = old; 3354 } 3355 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3356 } 3357 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3358 PetscCall(PetscMalloc1(m, &c->ilen)); 3359 PetscCall(PetscMalloc1(m, &c->imax)); 3360 c->maxnz = c->nz; 3361 c->nonzerorowcnt = 0; 3362 c->rmax = 0; 3363 for (k = 0; k < m; k++) { 3364 const PetscInt nn = c->i[k + 1] - c->i[k]; 3365 c->ilen[k] = c->imax[k] = nn; 3366 c->nonzerorowcnt += (PetscInt)!!nn; 3367 c->rmax = PetscMax(c->rmax, nn); 3368 } 3369 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3370 PetscCall(PetscMalloc1(c->nz, &c->a)); 3371 Ccsr->num_entries = c->nz; 3372 3373 C->nonzerostate++; 3374 PetscCall(PetscLayoutSetUp(C->rmap)); 3375 PetscCall(PetscLayoutSetUp(C->cmap)); 3376 Ccusp->nonzerostate = C->nonzerostate; 3377 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3378 C->preallocated = PETSC_TRUE; 3379 C->assembled = PETSC_FALSE; 3380 C->was_assembled = PETSC_FALSE; 3381 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3382 mmdata->reusesym = PETSC_TRUE; 3383 C->offloadmask = PETSC_OFFLOAD_GPU; 3384 } 3385 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3386 
PetscFunctionReturn(PETSC_SUCCESS); 3387 } 3388 3389 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3390 3391 /* handles sparse or dense B */ 3392 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3393 { 3394 Mat_Product *product = mat->product; 3395 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3396 3397 PetscFunctionBegin; 3398 MatCheckProduct(mat, 1); 3399 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3400 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3401 if (product->type == MATPRODUCT_ABC) { 3402 Ciscusp = PETSC_FALSE; 3403 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3404 } 3405 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3406 PetscBool usecpu = PETSC_FALSE; 3407 switch (product->type) { 3408 case MATPRODUCT_AB: 3409 if (product->api_user) { 3410 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3411 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3412 PetscOptionsEnd(); 3413 } else { 3414 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3415 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3416 PetscOptionsEnd(); 3417 } 3418 break; 3419 case MATPRODUCT_AtB: 3420 if (product->api_user) { 3421 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3422 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3423 PetscOptionsEnd(); 3424 } else { 3425 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3426 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3427 PetscOptionsEnd(); 3428 } 3429 break; 3430 case MATPRODUCT_PtAP: 3431 if (product->api_user) { 3432 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3433 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3434 PetscOptionsEnd(); 3435 } else { 3436 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3437 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3438 PetscOptionsEnd(); 3439 } 3440 break; 3441 case MATPRODUCT_RARt: 3442 if (product->api_user) { 3443 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3444 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3445 PetscOptionsEnd(); 3446 } else { 3447 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3448 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3449 PetscOptionsEnd(); 3450 } 3451 break; 3452 case MATPRODUCT_ABC: 3453 if (product->api_user) { 3454 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), 
((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3455 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3456 PetscOptionsEnd(); 3457 } else { 3458 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3459 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3460 PetscOptionsEnd(); 3461 } 3462 break; 3463 default: 3464 break; 3465 } 3466 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3467 } 3468 /* dispatch */ 3469 if (isdense) { 3470 switch (product->type) { 3471 case MATPRODUCT_AB: 3472 case MATPRODUCT_AtB: 3473 case MATPRODUCT_ABt: 3474 case MATPRODUCT_PtAP: 3475 case MATPRODUCT_RARt: 3476 if (product->A->boundtocpu) { 3477 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3478 } else { 3479 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3480 } 3481 break; 3482 case MATPRODUCT_ABC: 3483 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3484 break; 3485 default: 3486 break; 3487 } 3488 } else if (Biscusp && Ciscusp) { 3489 switch (product->type) { 3490 case MATPRODUCT_AB: 3491 case MATPRODUCT_AtB: 3492 case MATPRODUCT_ABt: 3493 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3494 break; 3495 case MATPRODUCT_PtAP: 3496 case MATPRODUCT_RARt: 3497 case MATPRODUCT_ABC: 3498 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3499 break; 3500 default: 3501 break; 3502 } 3503 } else { /* fallback for AIJ */ 3504 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3505 } 3506 PetscFunctionReturn(PETSC_SUCCESS); 3507 } 3508 3509 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3510 { 3511 PetscFunctionBegin; 3512 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3513 PetscFunctionReturn(PETSC_SUCCESS); 3514 } 3515 3516 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3517 { 3518 PetscFunctionBegin; 3519 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3520 PetscFunctionReturn(PETSC_SUCCESS); 3521 } 3522 3523 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3524 { 3525 PetscFunctionBegin; 3526 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3527 PetscFunctionReturn(PETSC_SUCCESS); 3528 } 3529 3530 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3531 { 3532 PetscFunctionBegin; 3533 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3534 PetscFunctionReturn(PETSC_SUCCESS); 3535 } 3536 3537 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3538 { 3539 PetscFunctionBegin; 3540 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3541 PetscFunctionReturn(PETSC_SUCCESS); 3542 } 3543 3544 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3545 { 3546 int i = blockIdx.x * blockDim.x + threadIdx.x; 3547 if (i < n) y[idx[i]] += x[i]; 3548 } 3549 3550 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3551 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3552 { 3553 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3554 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3555 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3556 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3557 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3558 PetscBool compressed; 3559 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3560 PetscInt nx, ny; 3561 #endif 3562 3563 PetscFunctionBegin; 3564 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3565 if (!a->nz) { 3566 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3567 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3568 PetscFunctionReturn(PETSC_SUCCESS); 3569 } 3570 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3571 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3572 if (!trans) { 3573 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3574 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3575 } else { 3576 if (herm || !A->form_explicit_transpose) { 3577 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3578 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3579 } else { 3580 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3581 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3582 } 3583 } 3584 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3585 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3586 3587 try { 3588 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3589 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3590 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3591 3592 PetscCall(PetscLogGpuTimeBegin()); 3593 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3594 /* z = A x + beta y. 3595 If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3596 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3597 */ 3598 xptr = xarray; 3599 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3600 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3601 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3602 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3603 allocated to accommodate different uses. So we get the length info directly from mat. 3604 */ 3605 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3606 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3607 nx = mat->num_cols; // since y = Ax 3608 ny = mat->num_rows; 3609 } 3610 #endif 3611 } else { 3612 /* z = A^T x + beta y 3613 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3614 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3615 */ 3616 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3617 dptr = zarray; 3618 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3619 if (compressed) { /* Scatter x to work vector */ 3620 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3621 3622 thrust::for_each( 3623 #if PetscDefined(HAVE_THRUST_ASYNC) 3624 thrust::cuda::par.on(PetscDefaultCudaStream), 3625 #endif 3626 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3627 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3628 } 3629 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3630 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3631 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3632 nx = mat->num_rows; // since y = A^T x 3633 ny = mat->num_cols; 3634 } 3635 #endif 3636 } 3637 3638 /* csr_spmv does y = alpha op(A) x + beta y */ 3639 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3640 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3641 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3642 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 3643 #else 3644 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3645 #endif 3646 3647 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3648 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3649 if (!matDescr) { 3650 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3651 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3652 } 3653 #endif 3654 3655 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3656 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3657 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3658 PetscCallCUSPARSE( 3659 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3660 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3661 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3662 PetscCallCUSPARSE( 3663 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3664 #endif 3665 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3666 } else { 3667 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3668 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3669 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3670 } 3671 3672 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, 
matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3673 #else 3674 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3675 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3676 #endif 3677 } else { 3678 if (cusparsestruct->nrows) { 3679 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3680 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3681 #else 3682 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3683 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3684 #endif 3685 } 3686 } 3687 PetscCall(PetscLogGpuTimeEnd()); 3688 3689 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3690 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3691 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3692 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3693 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3694 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3695 } 3696 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3697 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3698 } 3699 3700 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3701 if (compressed) { 3702 PetscCall(PetscLogGpuTimeBegin()); 3703 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3704 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3705 PetscCall(PetscLogGpuTimeEnd()); 3706 } 3707 } else { 3708 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3709 } 3710 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3711 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3712 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3713 } catch (char *ex) { 3714 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3715 } 3716 if (yy) { 3717 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3718 } else { 3719 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3720 } 3721 PetscFunctionReturn(PETSC_SUCCESS); 3722 } 3723 3724 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3725 { 3726 PetscFunctionBegin; 3727 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3728 PetscFunctionReturn(PETSC_SUCCESS); 3729 } 3730 3731 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx); 3732 3733 __global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag) 3734 { 3735 const size_t x = blockIdx.x * blockDim.x + threadIdx.x; 3736 3737 if (x < len) { 3738 const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx; 3739 PetscScalar d = 0.0; 3740 3741 for (PetscInt i = 0; i < num_non0_row; i++) { 3742 if (col[i + rowx] == x) { 3743 d = val[i + rowx]; 3744 break; 3745 } 3746 } 3747 diag[x] = d; 3748 } 3749 } 3750 3751 static PetscErrorCode 
MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag) 3752 { 3753 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3754 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3755 PetscScalar *darray; 3756 3757 PetscFunctionBegin; 3758 if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) { 3759 PetscInt n = A->rmap->n; 3760 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3761 3762 PetscCheck(cusparsestruct->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported"); 3763 if (n > 0) { 3764 PetscCall(VecCUDAGetArrayWrite(diag, &darray)); 3765 GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray); 3766 PetscCallCUDA(cudaPeekAtLastError()); 3767 PetscCall(VecCUDARestoreArrayWrite(diag, &darray)); 3768 } 3769 } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag)); 3770 PetscFunctionReturn(PETSC_SUCCESS); 3771 } 3772 3773 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3774 { 3775 PetscFunctionBegin; 3776 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3777 PetscFunctionReturn(PETSC_SUCCESS); 3778 } 3779 3780 /*@ 3781 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs 3782 3783 Collective 3784 3785 Input Parameters: 3786 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3787 . m - number of rows 3788 . n - number of columns 3789 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3790 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3791 3792 Output Parameter: 3793 . A - the matrix 3794 3795 Level: intermediate 3796 3797 Notes: 3798 This matrix will ultimately be pushed down to NVIDIA GPUs and will use the cuSPARSE library for 3799 calculations. For good matrix assembly performance the user should preallocate the matrix 3800 storage by setting the parameter `nz` (or the array `nnz`). 3801 3802 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3803 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3804 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3805 3806 The AIJ format, also called 3807 compressed row storage, is fully compatible with standard Fortran 3808 storage. That is, the stored row and column indices can begin at 3809 either one (as in Fortran) or zero. 3810 3811 Specify the preallocated storage with either nz or nnz (not both). 3812 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3813 allocation.
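For example, the following sketch preallocates room for at most 5 nonzeros in each row (it assumes `m` and `n` are previously set `PetscInt` sizes):
.vb
  Mat A;
  PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, n, 5, NULL, &A));
  // ... insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
  PetscCall(MatDestroy(&A));
.ve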
3814 3815 When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()` 3816 3817 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`, 3818 `MatSetPreallocationCOO()`, `MatSetValuesCOO()` 3819 @*/ 3820 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3821 { 3822 PetscFunctionBegin; 3823 PetscCall(MatCreate(comm, A)); 3824 PetscCall(MatSetSizes(*A, m, n, m, n)); 3825 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3826 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3827 PetscFunctionReturn(PETSC_SUCCESS); 3828 } 3829 3830 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3831 { 3832 PetscFunctionBegin; 3833 if (A->factortype == MAT_FACTOR_NONE) { 3834 PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 3835 } else { 3836 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3837 } 3838 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3839 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3840 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3841 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3842 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3843 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3844 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3845 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3846 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3847 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3848 PetscCall(MatDestroy_SeqAIJ(A)); 3849 PetscFunctionReturn(PETSC_SUCCESS); 3850 } 3851 3852 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3853 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3854 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3855 { 3856 PetscFunctionBegin; 3857 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3858 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3859 PetscFunctionReturn(PETSC_SUCCESS); 3860 } 3861 3862 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3863 { 3864 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3865 Mat_SeqAIJCUSPARSE *cy; 3866 Mat_SeqAIJCUSPARSE *cx; 3867 PetscScalar *ay; 3868 const PetscScalar *ax; 3869 CsrMatrix *csry, *csrx; 3870 3871 PetscFunctionBegin; 3872 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3873 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3874 if (X->ops->axpy != Y->ops->axpy) { 3875 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3876 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3877 PetscFunctionReturn(PETSC_SUCCESS); 3878 } 3879 /* if we are here, it means both matrices are bound to GPU */ 3880 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3881 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3882 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3883 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3884 csry = (CsrMatrix *)cy->mat->mat; 3885 csrx = (CsrMatrix *)cx->mat->mat; 3886 /* see if we can turn this into a cublas axpy */ 3887 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3888 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3889 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3890 if (eq) str = SAME_NONZERO_PATTERN; 3891 } 3892 /* spgeam is buggy with one column */ 3893 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3894 3895 if (str == SUBSET_NONZERO_PATTERN) { 3896 PetscScalar b = 1.0; 3897 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3898 size_t bufferSize; 3899 void *buffer; 3900 #endif 3901 3902 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3903 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3904 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3905 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3906 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3907 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3908 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3909 PetscCall(PetscLogGpuTimeBegin()); 3910 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3911 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3912 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3913 PetscCall(PetscLogGpuTimeEnd()); 3914 PetscCallCUDA(cudaFree(buffer)); 3915 #else 3916 PetscCall(PetscLogGpuTimeBegin()); 3917 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3918 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3919 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3920 PetscCall(PetscLogGpuTimeEnd()); 3921 #endif 3922 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3923 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3924 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3925 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3926 } else if (str == SAME_NONZERO_PATTERN) { 3927 cublasHandle_t cublasv2handle; 3928 PetscBLASInt one = 1, bnz = 1; 3929 3930 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3931 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3932 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3933 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3934 
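/* identical nonzero patterns: Y += a*X reduces to a dense axpy over the bnz stored values, i.e. ay[k] += a*ax[k] */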
PetscCall(PetscLogGpuTimeBegin()); 3935 PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3936 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3937 PetscCall(PetscLogGpuTimeEnd()); 3938 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3939 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3940 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3941 } else { 3942 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3943 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3944 } 3945 PetscFunctionReturn(PETSC_SUCCESS); 3946 } 3947 3948 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3949 { 3950 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3951 PetscScalar *ay; 3952 cublasHandle_t cublasv2handle; 3953 PetscBLASInt one = 1, bnz = 1; 3954 3955 PetscFunctionBegin; 3956 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3957 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3958 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3959 PetscCall(PetscLogGpuTimeBegin()); 3960 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3961 PetscCall(PetscLogGpuFlops(bnz)); 3962 PetscCall(PetscLogGpuTimeEnd()); 3963 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3964 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3965 PetscFunctionReturn(PETSC_SUCCESS); 3966 } 3967 3968 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3969 { 3970 PetscBool gpu = PETSC_FALSE; 3971 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3972 3973 PetscFunctionBegin; 3974 if (A->factortype == MAT_FACTOR_NONE) { 3975 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3976 if (spptr->mat) { 3977 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3978 if (matrix->values) { 3979 gpu = PETSC_TRUE; 3980 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3981 } 3982 } 3983 if (spptr->matTranspose) { 3984 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3985 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3986 } 3987 } 3988 if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU; 3989 else { 3990 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3991 A->offloadmask = PETSC_OFFLOAD_CPU; 3992 } 3993 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3994 PetscFunctionReturn(PETSC_SUCCESS); 3995 } 3996 3997 static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m) 3998 { 3999 PetscFunctionBegin; 4000 *m = PETSC_MEMTYPE_CUDA; 4001 PetscFunctionReturn(PETSC_SUCCESS); 4002 } 4003 4004 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 4005 { 4006 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4007 4008 PetscFunctionBegin; 4009 if (A->factortype != MAT_FACTOR_NONE) { 4010 A->boundtocpu = flg; 4011 PetscFunctionReturn(PETSC_SUCCESS); 4012 } 4013 if (flg) { 4014 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4015 4016 A->ops->scale = MatScale_SeqAIJ; 4017 A->ops->getdiagonal = MatGetDiagonal_SeqAIJ; 4018 A->ops->axpy = MatAXPY_SeqAIJ; 4019 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4020 A->ops->mult = MatMult_SeqAIJ; 4021 A->ops->multadd = MatMultAdd_SeqAIJ; 4022 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4023 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4024 A->ops->multhermitiantranspose = NULL; 4025 A->ops->multhermitiantransposeadd = NULL; 4026 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 4027 A->ops->getcurrentmemtype = NULL; 4028 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 4029 PetscCall(PetscObjectComposeFunction((PetscObject)A, 
"MatSeqAIJCopySubArray_C", NULL)); 4030 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 4031 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 4032 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 4033 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 4034 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 4035 } else { 4036 A->ops->scale = MatScale_SeqAIJCUSPARSE; 4037 A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE; 4038 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4039 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4040 A->ops->mult = MatMult_SeqAIJCUSPARSE; 4041 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4042 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4043 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4044 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4045 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4046 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4047 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4048 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4049 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4050 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4051 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4052 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4053 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4054 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4055 4056 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4057 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4058 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4059 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4060 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4061 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4062 } 4063 A->boundtocpu = flg; 4064 if (flg && a->inode.size_csr) { 4065 a->inode.use = PETSC_TRUE; 4066 } else { 4067 a->inode.use = PETSC_FALSE; 4068 } 4069 PetscFunctionReturn(PETSC_SUCCESS); 4070 } 4071 4072 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4073 { 4074 Mat B; 4075 4076 PetscFunctionBegin; 4077 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4078 if (reuse == MAT_INITIAL_MATRIX) { 4079 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4080 } else if (reuse == MAT_REUSE_MATRIX) { 4081 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4082 } 4083 B = *newmat; 4084 4085 PetscCall(PetscFree(B->defaultvectype)); 4086 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4087 4088 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4089 if 
(B->factortype == MAT_FACTOR_NONE) { 4090 Mat_SeqAIJCUSPARSE *spptr; 4091 PetscCall(PetscNew(&spptr)); 4092 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4093 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4094 spptr->format = MAT_CUSPARSE_CSR; 4095 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4096 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4097 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4098 #else 4099 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4100 #endif 4101 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4102 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4103 #endif 4104 B->spptr = spptr; 4105 } else { 4106 Mat_SeqAIJCUSPARSETriFactors *spptr; 4107 4108 PetscCall(PetscNew(&spptr)); 4109 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4110 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4111 B->spptr = spptr; 4112 } 4113 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4114 } 4115 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4116 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4117 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4118 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4119 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4120 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4121 B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4122 4123 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 4124 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 4125 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4126 #if defined(PETSC_HAVE_HYPRE) 4127 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4128 #endif 4129 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4130 PetscFunctionReturn(PETSC_SUCCESS); 4131 } 4132 4133 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4134 { 4135 PetscFunctionBegin; 4136 PetscCall(MatCreate_SeqAIJ(B)); 4137 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 4138 PetscFunctionReturn(PETSC_SUCCESS); 4139 } 4140 4141 /*MC 4142 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs. 4143 4144 Options Database Keys: 4145 + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 4146 . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 4147 Other options include ell (ellpack) or hyb (hybrid). 4148 . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 4149 - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU 4150 4151 Level: beginner 4152 4153 Notes: 4154 These matrices can be in either CSR, ELL, or HYB format. 4155 4156 All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library. 4157 4158 Uses 32-bit integers internally. If PETSc is configured with `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens 4159 if some integer values passed in do not fit in `int`.
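For example, a code that creates its matrices with `MatCreate()` and `MatSetFromOptions()` can be moved to the GPU entirely from the command line; in the illustrative sketch below, `./my_app` is only a placeholder for the application executable:
.vb
  ./my_app -mat_type aijcusparse -mat_cusparse_storage_format csr
.ve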
4160 4161 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4162 M*/ 4163 4164 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4165 { 4166 PetscFunctionBegin; 4167 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 4168 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 4169 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 4170 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4171 PetscFunctionReturn(PETSC_SUCCESS); 4172 } 4173 4174 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4175 { 4176 Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4177 4178 PetscFunctionBegin; 4179 if (cusp) { 4180 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 4181 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4182 delete cusp->workVector; 4183 delete cusp->rowoffsets_gpu; 4184 delete cusp->csr2csc_i; 4185 delete cusp->coords; 4186 if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 4187 PetscCall(PetscFree(mat->spptr)); 4188 } 4189 PetscFunctionReturn(PETSC_SUCCESS); 4190 } 4191 4192 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4193 { 4194 PetscFunctionBegin; 4195 if (*mat) { 4196 delete (*mat)->values; 4197 delete (*mat)->column_indices; 4198 delete (*mat)->row_offsets; 4199 delete *mat; 4200 *mat = 0; 4201 } 4202 PetscFunctionReturn(PETSC_SUCCESS); 4203 } 4204 4205 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4206 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4207 { 4208 PetscFunctionBegin; 4209 if (*trifactor) { 4210 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4211 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4212 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4213 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4214 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4216 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4217 #endif 4218 PetscCall(PetscFree(*trifactor)); 4219 } 4220 PetscFunctionReturn(PETSC_SUCCESS); 4221 } 4222 #endif 4223 4224 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4225 { 4226 CsrMatrix *mat; 4227 4228 PetscFunctionBegin; 4229 if (*matstruct) { 4230 if ((*matstruct)->mat) { 4231 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4232 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4233 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4234 #else 4235 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4236 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4237 #endif 4238 } else { 4239 mat = (CsrMatrix *)(*matstruct)->mat; 4240 PetscCall(CsrMatrix_Destroy(&mat)); 4241 } 4242 } 4243 if ((*matstruct)->descr) 
PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4244 delete (*matstruct)->cprowIndices; 4245 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4246 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4247 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4248 4249 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4250 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4251 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4252 4253 for (int i = 0; i < 3; i++) { 4254 if (mdata->cuSpMV[i].initialized) { 4255 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4256 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4257 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4258 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4259 if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4260 if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4261 #endif 4262 } 4263 } 4264 #endif 4265 delete *matstruct; 4266 *matstruct = NULL; 4267 } 4268 PetscFunctionReturn(PETSC_SUCCESS); 4269 } 4270 4271 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4272 { 4273 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4274 4275 PetscFunctionBegin; 4276 if (fs) { 4277 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4278 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4279 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4280 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4281 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4282 delete fs->workVector; 4283 fs->workVector = NULL; 4284 #endif 4285 delete fs->rpermIndices; 4286 delete fs->cpermIndices; 4287 fs->rpermIndices = NULL; 4288 fs->cpermIndices = NULL; 4289 fs->init_dev_prop = PETSC_FALSE; 4290 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4291 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4292 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4293 PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 4294 PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4295 PetscCallCUDA(cudaFree(fs->csrVal)); 4296 PetscCallCUDA(cudaFree(fs->diag)); 4297 PetscCallCUDA(cudaFree(fs->X)); 4298 PetscCallCUDA(cudaFree(fs->Y)); 4299 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares its buffer with one of spsvBuffer_L/U */ 4300 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4301 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4302 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4303 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4304 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4305 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4306 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4307 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4308 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4309 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4310 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4311 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4312 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4313 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4314 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4315 PetscCall(PetscFree(fs->csrRowPtr_h)); 4316 PetscCall(PetscFree(fs->csrVal_h)); 4317 
PetscCall(PetscFree(fs->diag_h)); 4318 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4319 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4320 #endif 4321 } 4322 PetscFunctionReturn(PETSC_SUCCESS); 4323 } 4324 4325 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4326 { 4327 PetscFunctionBegin; 4328 if (*trifactors) { 4329 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4330 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 4331 PetscCall(PetscFree(*trifactors)); 4332 } 4333 PetscFunctionReturn(PETSC_SUCCESS); 4334 } 4335 4336 struct IJCompare { 4337 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4338 { 4339 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4340 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4341 return false; 4342 } 4343 }; 4344 4345 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4346 { 4347 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4348 4349 PetscFunctionBegin; 4350 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4351 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4352 if (destroy) { 4353 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4354 delete cusp->csr2csc_i; 4355 cusp->csr2csc_i = NULL; 4356 } 4357 A->transupdated = PETSC_FALSE; 4358 PetscFunctionReturn(PETSC_SUCCESS); 4359 } 4360 4361 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4362 { 4363 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 4364 4365 PetscFunctionBegin; 4366 PetscCallCUDA(cudaFree(coo->perm)); 4367 PetscCallCUDA(cudaFree(coo->jmap)); 4368 PetscCall(PetscFree(coo)); 4369 PetscFunctionReturn(PETSC_SUCCESS); 4370 } 4371 4372 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4373 { 4374 PetscBool dev_ij = PETSC_FALSE; 4375 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4376 PetscInt *i, *j; 4377 PetscContainer container_h; 4378 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4379 4380 PetscFunctionBegin; 4381 PetscCall(PetscGetMemType(coo_i, &mtype)); 4382 if (PetscMemTypeDevice(mtype)) { 4383 dev_ij = PETSC_TRUE; 4384 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4385 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4386 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4387 } else { 4388 i = coo_i; 4389 j = coo_j; 4390 } 4391 4392 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4393 if (dev_ij) PetscCall(PetscFree2(i, j)); 4394 mat->offloadmask = PETSC_OFFLOAD_CPU; 4395 // Create the GPU memory 4396 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4397 4398 // Copy the COO struct to device 4399 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4400 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4401 PetscCall(PetscMalloc1(1, &coo_d)); 4402 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4403 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4404 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4405 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4406 PetscCallCUDA(cudaMemcpy(coo_d->perm, 
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
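
/*
   A minimal usage sketch (illustrative only, not part of this file) of the COO assembly path
   implemented above: set the nonzero pattern once with MatSetPreallocationCOO(), then update
   the values with MatSetValuesCOO() as often as needed. The 2x2 pattern and values are made up.

     Mat         A;
     PetscInt    i[] = {0, 0, 1, 1}, j[] = {0, 1, 1, 1}; // entry (1,1) appears twice and is summed
     PetscScalar v[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, i, j));   // builds the host and device MatCOOStruct_SeqAIJ
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // launches MatAddCOOValues() on the GPU
     PetscCall(MatDestroy(&A));
*/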

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When `compressed` is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}
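
/*
   A minimal sketch (illustrative only) of the accessors above: fetch the device CSR pattern of an
   assembled MATSEQAIJCUSPARSE matrix A and copy the row offsets to the host. NROWS is a stand-in
   for the number of rows of A; PETSC_FALSE requests the uncompressed row pointers.

     const int *di, *dj;
     int        rowptr[NROWS + 1];

     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj)); // di/dj point to device memory
     PetscCallCUDA(cudaMemcpy(rowptr, di, (NROWS + 1) * sizeof(int), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/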

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
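
/*
   A minimal sketch (illustrative only) of the read accessor above: copy the stored nonzero values
   of an assembled MATSEQAIJCUSPARSE matrix A back to the host. NNZ is a stand-in for the nonzero
   count, e.g. ((Mat_SeqAIJ *)A->data)->nz.

     const PetscScalar *da;
     PetscScalar        va[NNZ];

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da)); // may first sync host data to the device
     PetscCallCUDA(cudaMemcpy(va, da, NNZ * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da)); // only clears the pointer
*/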

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
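
/*
   A minimal sketch (illustrative only) of the read-write accessor above: negate every stored
   nonzero of an assembled MATSEQAIJCUSPARSE matrix A in place on the device, much like
   MatScale(A, -1.0) would. NNZ is again a stand-in for the nonzero count.

     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &da));
     {
       thrust::device_ptr<PetscScalar> d = thrust::device_pointer_cast(da);
       thrust::transform(d, d + NNZ, d, thrust::negate<PetscScalar>());
     }
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &da)); // bumps the object state, invalidates the cached diagonal
*/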

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger any host-to-device copies.

  It marks the GPU data as valid, so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
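
/*
   A minimal sketch (illustrative only) of the write-only accessor above: overwrite every stored
   nonzero of an assembled MATSEQAIJCUSPARSE matrix A on the device, as MatZeroEntries() would.
   Since the access marks the GPU data as valid, all NNZ entries (a stand-in for the nonzero
   count) must be written.

     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &da)); // no host-to-device copy happens here
     {
       thrust::device_ptr<PetscScalar> d = thrust::device_pointer_cast(da);
       thrust::fill(d, d + NNZ, (PetscScalar)0.0);
     }
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &da));
*/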

struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows,
   the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);

      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 // Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
#else
#if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
#else
      auto pred = cuda::std::identity();
#endif
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32-bit to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* the gathered values end up on the host in this case */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
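
/*
   A minimal usage sketch (illustrative only) of MatSeqAIJCUSPARSEMergeMats() defined above:
   with A of size m x nA and B of size m x nB, C becomes the m x (nA + nB) horizontal
   concatenation [A B], assembled directly on the GPU.

     Mat C;

     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // builds C's pattern and values
     // ... change the values (but not the pattern) of A and B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C)); // refreshes C's values only
     PetscCall(MatDestroy(&C));
*/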