/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#if PETSC_CPP_VERSION >= 14
  #define PETSC_HAVE_THRUST_ASYNC 1
// thrust::for_each(thrust::cuda::par.on()) requires C++14
#endif
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
  #include <cuda/std/functional>
#endif

const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
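
/*
  Usage sketch for the algorithm/format tables above: PetscOptionsEnum() maps a command line string to an
  enum by its 0-based position in the table, so run-time options such as

    -mat_cusparse_storage_format ell
    -mat_cusparse_spmv_alg csrmv_alg1

  (option values here are illustrative) only select the intended cuSPARSE algorithm when the table order
  matches the cuSPARSE enum values; the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below
  guard exactly this invariant.
*/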
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).
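
  Example Usage:
.vb
  // a minimal sketch (illustrative, not taken from a PETSc example); assumes A is a
  // MATSEQAIJCUSPARSE matrix that will be LU-factored, e.g. via -pc_type lu
  PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE)); // same effect as -mat_cusparse_use_cpu_solve
.ve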

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) {  // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                     // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = adiag[i] - adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) {  // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                     // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
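      // Layout sketch (inferred from the copy loops below, stated here for clarity): row i of M stores the
      // diagonal first (column i, value 1, since U is declared unit-diagonal), followed by the negated
      // strictly-upper entries of row i of U, while D[] receives the already-inverted diagonal Aa[adiag[i]].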
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]] = i;                                                  // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if we need to permute, we have to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
PetscCall(PetscNew(&upTriFactorT)); 1134 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1135 1136 /* set the matrix descriptors of the upper triangular factor */ 1137 matrixType = cusparseGetMatType(upTriFactor->descr); 1138 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1139 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1140 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1141 1142 /* Create the matrix description */ 1143 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1144 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1145 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1146 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1147 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1148 1149 /* set the operation */ 1150 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1151 1152 /* allocate GPU space for the CSC of the upper triangular factor*/ 1153 upTriFactorT->csrMat = new CsrMatrix; 1154 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1155 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1156 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1157 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1158 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1159 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1160 1161 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1163 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1164 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1166 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1167 #endif 1168 1169 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1170 { 1171 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
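// (The argument list of cusparse_csr2csc below changes with the CUDA version via #if/#else,
// and preprocessor directives are not allowed inside a macro's argument list, so we capture
// the status in a local variable and check it afterwards.)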
1172 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1173 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1174 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1175 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1176 #else 1177 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1178 #endif 1179 PetscCallCUSPARSE(stat); 1180 } 1181 1182 PetscCallCUDA(WaitForCUDA()); 1183 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1184 1185 /* Create the solve analysis information */ 1186 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1187 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 1188 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1189 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1190 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 1191 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1192 #endif 1193 1194 /* perform the solve analysis */ 1195 /* TODO: the upper-factor transpose setup above duplicates the lower-factor code almost verbatim and should be hoisted into a shared helper function
*/ 1196 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1197 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1198 1199 PetscCallCUDA(WaitForCUDA()); 1200 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1201 1202 /* assign the pointer */ 1203 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1204 PetscFunctionReturn(PETSC_SUCCESS); 1205 } 1206 #endif 1207 1208 struct PetscScalarToPetscInt { 1209 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1210 }; 1211 1212 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1213 { 1214 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1215 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1216 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1217 cusparseStatus_t stat; 1218 cusparseIndexBase_t indexBase; 1219 1220 PetscFunctionBegin; 1221 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1222 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1223 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1224 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1225 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1226 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1227 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1228 PetscCall(PetscLogGpuTimeBegin()); 1229 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1230 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1232 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1233 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1234 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1235 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1236 1237 /* set alpha and beta */ 1238 PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1239 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1240 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 1241 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1242 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1243 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1244 1245 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246 CsrMatrix *matrixT = new CsrMatrix; 1247 matstructT->mat = matrixT; 1248 matrixT->num_rows = A->cmap->n; 1249 matrixT->num_cols = A->rmap->n; 1250 matrixT->num_entries = a->nz; 1251 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1252 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253 matrixT->values = new THRUSTARRAY(a->nz); 1254 1255 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1256 
cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1257 1258 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1259 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1260 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1261 indexBase, cusparse_scalartype); 1262 PetscCallCUSPARSE(stat); 1263 #else 1264 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1265 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1266 1267 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1268 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1269 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1270 */ 1271 if (matrixT->num_entries) { 1272 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1273 PetscCallCUSPARSE(stat); 1274 1275 } else { 1276 matstructT->matDescr = NULL; 1277 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1278 } 1279 #endif 1280 #endif 1281 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1282 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1283 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1284 #else 1285 CsrMatrix *temp = new CsrMatrix; 1286 CsrMatrix *tempT = new CsrMatrix; 1287 /* First convert HYB to CSR */ 1288 temp->num_rows = A->rmap->n; 1289 temp->num_cols = A->cmap->n; 1290 temp->num_entries = a->nz; 1291 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1292 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1293 temp->values = new THRUSTARRAY(a->nz); 1294 1295 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1296 PetscCallCUSPARSE(stat); 1297 1298 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1299 tempT->num_rows = A->rmap->n; 1300 tempT->num_cols = A->cmap->n; 1301 tempT->num_entries = a->nz; 1302 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1303 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1304 tempT->values = new THRUSTARRAY(a->nz); 1305 1306 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1307 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1308 PetscCallCUSPARSE(stat); 1309 1310 /* Last, convert CSC to HYB */ 1311 cusparseHybMat_t hybMat; 1312 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1313 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1314 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1315 PetscCallCUSPARSE(stat); 1316 1317 /* assign the pointer */ 1318 matstructT->mat = hybMat; 1319 A->transupdated = PETSC_TRUE; 1320 /* delete temporaries */ 1321 if (tempT) { 1322 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1323 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1324 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1325 delete (CsrMatrix *)tempT; 1326 } 1327 if (temp) { 1328 if (temp->values) delete (THRUSTARRAY *)temp->values; 1329 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1330 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1331 delete (CsrMatrix *)temp; 1332 } 1333 #endif 1334 } 1335 } 1336 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1337 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1338 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1339 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1340 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1341 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1342 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1343 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1344 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1345 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1346 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1347 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1348 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1349 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1350 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1351 } 1352 if (!cusparsestruct->csr2csc_i) { 1353 THRUSTARRAY csr2csc_a(matrix->num_entries); 1354 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1355 1356 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1357 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1358 void *csr2cscBuffer; 1359 size_t csr2cscBufferSize; 1360 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1361 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 1362 PetscCallCUSPARSE(stat); 1363 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1364 #endif 1365 1366 if (matrix->num_entries) { 1367 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1368 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1369 I checked every parameters and they were just fine. I have no clue why cusparse complains. 
1370 1371 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1372 should be filled with indexBase. So I just take a shortcut here. 1373 */ 1374 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1375 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1376 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1377 PetscCallCUSPARSE(stat); 1378 #else 1379 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1380 PetscCallCUSPARSE(stat); 1381 #endif 1382 } else { 1383 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1384 } 1385 1386 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1387 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1388 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1389 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1390 #endif 1391 } 1392 PetscCallThrust( 1393 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1394 } 1395 PetscCall(PetscLogGpuTimeEnd()); 1396 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1397 /* the compressed row indices is not used for matTranspose */ 1398 matstructT->cprowIndices = NULL; 1399 /* assign the pointer */ 1400 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1401 A->transupdated = PETSC_TRUE; 1402 PetscFunctionReturn(PETSC_SUCCESS); 1403 } 1404 1405 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1406 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1407 { 1408 const PetscScalar *barray; 1409 PetscScalar *xarray; 1410 thrust::device_ptr<const PetscScalar> bGPU; 1411 thrust::device_ptr<PetscScalar> xGPU; 1412 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1413 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1414 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1415 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1416 PetscInt m = A->rmap->n; 1417 1418 PetscFunctionBegin; 1419 PetscCall(PetscLogGpuTimeBegin()); 1420 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1421 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1422 xGPU = thrust::device_pointer_cast(xarray); 1423 bGPU = thrust::device_pointer_cast(barray); 1424 1425 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1426 if (fs->rpermIndices) { 1427 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1428 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1429 } else { 1430 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1431 } 1432 1433 // Solve L Y = X 1434 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1435 // Note that cusparseSpSV_solve() secretly uses 
the external buffer used in cusparseSpSV_analysis()! 1436 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1437 1438 // Solve U X = Y 1439 if (fs->cpermIndices) { 1440 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1441 } else { 1442 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1443 } 1444 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1445 1446 // Reorder X with the column permutation if needed, and put the result back to x 1447 if (fs->cpermIndices) { 1448 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1449 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1450 } 1451 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1452 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1453 PetscCall(PetscLogGpuTimeEnd()); 1454 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1455 PetscFunctionReturn(PETSC_SUCCESS); 1456 } 1457 1458 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1459 { 1460 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1461 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1462 const PetscScalar *barray; 1463 PetscScalar *xarray; 1464 thrust::device_ptr<const PetscScalar> bGPU; 1465 thrust::device_ptr<PetscScalar> xGPU; 1466 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1467 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1468 PetscInt m = A->rmap->n; 1469 1470 PetscFunctionBegin; 1471 PetscCall(PetscLogGpuTimeBegin()); 1472 if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1473 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1474 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
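The same cusparseSpMatDescr_t may be passed with a different opA; only the SpSV descriptor and its work buffer are specific to the transposed solve.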
We only do transpose solve with it */ 1475 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1476 1477 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1478 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1479 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1480 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1481 fs->createdTransposeSpSVDescr = PETSC_TRUE; 1482 } 1483 1484 if (!fs->updatedTransposeSpSVAnalysis) { 1485 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1486 1487 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1488 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1489 } 1490 1491 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1492 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1493 xGPU = thrust::device_pointer_cast(xarray); 1494 bGPU = thrust::device_pointer_cast(barray); 1495 1496 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1497 if (fs->rpermIndices) { 1498 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1499 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1500 } else { 1501 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1502 } 1503 1504 // Solve Ut Y = X 1505 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1506 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1507 1508 // Solve Lt X = Y 1509 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1510 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1511 } else { 1512 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1513 } 1514 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1515 1516 // Reorder X with the column permutation if needed, and put the result back to x 1517 if (fs->cpermIndices) { 1518 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1519 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1520 } 1521 1522 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1523 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1524 PetscCall(PetscLogGpuTimeEnd()); 1525 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1526 PetscFunctionReturn(PETSC_SUCCESS); 1527 } 1528 #else 1529 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? 
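Presumably performance: with the legacy csrsv2 API a CUSPARSE_OPERATION_TRANSPOSE solve needs its own analysis anyway and traverses the CSR structure in the slow direction, so it pays to build the transposed (CSC) factors once and run plain non-transpose solves against them.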
*/ 1530 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1531 { 1532 PetscInt n = xx->map->n; 1533 const PetscScalar *barray; 1534 PetscScalar *xarray; 1535 thrust::device_ptr<const PetscScalar> bGPU; 1536 thrust::device_ptr<PetscScalar> xGPU; 1537 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1538 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1539 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1540 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1541 1542 PetscFunctionBegin; 1543 /* Analyze the matrix and create the transpose ... on the fly */ 1544 if (!loTriFactorT && !upTriFactorT) { 1545 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1546 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1547 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1548 } 1549 1550 /* Get the GPU pointers */ 1551 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1552 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1553 xGPU = thrust::device_pointer_cast(xarray); 1554 bGPU = thrust::device_pointer_cast(barray); 1555 1556 PetscCall(PetscLogGpuTimeBegin()); 1557 /* First, reorder with the row permutation */ 1558 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1559 1560 /* First, solve U */ 1561 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1562 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1563 1564 /* Then, solve L */ 1565 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1566 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1567 1568 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1569 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1570 1571 /* Copy the temporary to the full solution. 
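The gather above could not write into xGPU directly, since source and destination would alias.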
*/ 1572 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1573 1574 /* restore */ 1575 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1576 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1577 PetscCall(PetscLogGpuTimeEnd()); 1578 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1579 PetscFunctionReturn(PETSC_SUCCESS); 1580 } 1581 1582 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1583 { 1584 const PetscScalar *barray; 1585 PetscScalar *xarray; 1586 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1587 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1588 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1589 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1590 1591 PetscFunctionBegin; 1592 /* Analyze the matrix and create the transpose ... on the fly */ 1593 if (!loTriFactorT && !upTriFactorT) { 1594 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1595 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1596 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1597 } 1598 1599 /* Get the GPU pointers */ 1600 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1601 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1602 1603 PetscCall(PetscLogGpuTimeBegin()); 1604 /* First, solve U */ 1605 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 1606 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1607 1608 /* Then, solve L */ 1609 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1610 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1611 1612 /* restore */ 1613 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1614 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1615 PetscCall(PetscLogGpuTimeEnd()); 1616 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1617 PetscFunctionReturn(PETSC_SUCCESS); 1618 } 1619 1620 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1621 { 1622 const PetscScalar *barray; 1623 PetscScalar *xarray; 1624 thrust::device_ptr<const PetscScalar> bGPU; 1625 thrust::device_ptr<PetscScalar> xGPU; 1626 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1627 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1628 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1629 THRUSTARRAY *tempGPU = (THRUSTARRAY 
*)cusparseTriFactors->workVector; 1630 1631 PetscFunctionBegin; 1632 /* Get the GPU pointers */ 1633 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1634 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1635 xGPU = thrust::device_pointer_cast(xarray); 1636 bGPU = thrust::device_pointer_cast(barray); 1637 1638 PetscCall(PetscLogGpuTimeBegin()); 1639 /* First, reorder with the row permutation */ 1640 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1641 1642 /* Next, solve L */ 1643 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1644 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1645 1646 /* Then, solve U */ 1647 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1648 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1649 1650 /* Last, reorder with the column permutation */ 1651 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 1652 1653 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1654 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1655 PetscCall(PetscLogGpuTimeEnd()); 1656 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1657 PetscFunctionReturn(PETSC_SUCCESS); 1658 } 1659 1660 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1661 { 1662 const PetscScalar *barray; 1663 PetscScalar *xarray; 1664 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1665 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1666 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1667 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1668 1669 PetscFunctionBegin; 1670 /* Get the GPU pointers */ 1671 PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 1672 PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1673 1674 PetscCall(PetscLogGpuTimeBegin()); 1675 /* First, solve L */ 1676 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 1677 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1678 1679 /* Next, solve U */ 1680 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 1681 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1682 1683 PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 1684 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 1685 PetscCall(PetscLogGpuTimeEnd()); 1686 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1687 PetscFunctionReturn(PETSC_SUCCESS); 1688 } 1689 #endif 1690 1691 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1692 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1693 { 1694 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1695 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1696 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1697 CsrMatrix *Acsr; 1698 PetscInt m, nz; 1699 PetscBool flg; 1700 1701 PetscFunctionBegin; 1702 if (PetscDefined(USE_DEBUG)) { 1703 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1704 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1705 } 1706 1707 /* Copy A's value to fact */ 1708 m = fact->rmap->n; 1709 nz = aij->nz; 1710 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1711 Acsr = (CsrMatrix *)Acusp->mat->mat; 1712 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1713 1714 PetscCall(PetscLogGpuTimeBegin()); 1715 /* Factorize fact inplace */ 1716 if (m) 1717 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1718 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1719 if (PetscDefined(USE_DEBUG)) { 1720 int numerical_zero; 1721 cusparseStatus_t status; 1722 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1723 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1724 } 1725 1726 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1727 if (fs->updatedSpSVAnalysis) { 1728 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1729 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1730 } else 1731 #endif 1732 { 1733 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1734 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1735 */ 1736 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1737 1738 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1739 1740 fs->updatedSpSVAnalysis = PETSC_TRUE; 1741 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1742 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1743 } 1744 1745 fact->offloadmask = PETSC_OFFLOAD_GPU; 1746 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1747 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1748 fact->ops->matsolve = NULL; 1749 fact->ops->matsolvetranspose = NULL; 1750 PetscCall(PetscLogGpuTimeEnd()); 1751 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1752 PetscFunctionReturn(PETSC_SUCCESS); 1753 } 1754 1755 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1756 { 1757 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1758 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1759 PetscInt m, nz; 1760 1761 PetscFunctionBegin; 1762 if (PetscDefined(USE_DEBUG)) { 1763 PetscBool flg, diagDense; 1764 1765 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1766 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1767 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1768 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense)); 1769 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry"); 1770 } 1771 1772 /* Free the old stale stuff */ 1773 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1774 1775 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1776 but they will not be used. Allocate them just for easy debugging. 1777 */ 1778 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1779 1780 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1781 fact->factortype = MAT_FACTOR_ILU; 1782 fact->info.factor_mallocs = 0; 1783 fact->info.fill_ratio_given = info->fill; 1784 fact->info.fill_ratio_needed = 1.0; 1785 1786 aij->row = NULL; 1787 aij->col = NULL; 1788 1789 /* ====================================================================== */ 1790 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1791 /* We'll do in-place factorization on fact */ 1792 /* ====================================================================== */ 1793 const int *Ai, *Aj; 1794 1795 m = fact->rmap->n; 1796 nz = aij->nz; 1797 1798 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1799 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1800 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1801 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
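The factorization needs the full row pointer of length m+1; the compressed form, which skips empty rows, would not line up with csrRowPtr32 below.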
The returned Ai, Aj are 32-bit */ 1802 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1803 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1804 1805 /* ====================================================================== */ 1806 /* Create descriptors for M, L, U */ 1807 /* ====================================================================== */ 1808 cusparseFillMode_t fillMode; 1809 cusparseDiagType_t diagType; 1810 1811 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1812 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1813 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1814 1815 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1816 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1817 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1818 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1819 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1820 */ 1821 fillMode = CUSPARSE_FILL_MODE_LOWER; 1822 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1823 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1824 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1825 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1826 1827 fillMode = CUSPARSE_FILL_MODE_UPPER; 1828 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1829 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1830 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1831 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1832 1833 /* ========================================================================= */ 1834 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1835 /* ========================================================================= */ 1836 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1837 if (m) 1838 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1839 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1840 1841 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1842 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1843 1844 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1845 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1846 1847 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1848 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1849 1850 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1851 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1852 1853 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1854 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1855 spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U. 1856 To save memory, we let factBuffer_M share storage with the bigger of spsvBuffer_L/U. 1857 */ 1858 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1859 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1860 fs->spsvBuffer_L = fs->factBuffer_M; 1861 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1862 } else { 1863 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1864 fs->spsvBuffer_U = fs->factBuffer_M; 1865 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1866 } 1867 1868 /* ========================================================================== */ 1869 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1870 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1871 /* ========================================================================== */ 1872 int structural_zero; 1873 cusparseStatus_t status; 1874 1875 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1876 if (m) 1877 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1878 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1879 if (PetscDefined(USE_DEBUG)) { 1880 /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1881 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1882 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1883 } 1884 1885 /* Estimate FLOPs of the numeric factorization */ 1886 { 1887 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1888 PetscInt *Ai, nzRow, nzLeft; 1889 const PetscInt *adiag; 1890 PetscLogDouble flops = 0.0; 1891 1892 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL)); 1893 Ai = Aseq->i; 1894 for (PetscInt i = 0; i < m; i++) { 1895 if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1896 nzRow = Ai[i + 1] - Ai[i]; 1897 nzLeft = adiag[i] - Ai[i]; 1898 /* We eliminate the nonzeros left of the diagonal one by one; each elimination updates the nonzeros to the right of, 1899 and including, the eliminated one, which incurs a multiplication and an addition.
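Eliminating the k-th of the nzLeft sub-diagonal entries thus touches nzRow - k + 1 entries at 2 flops each; summing over k = 1..nzLeft gives nzLeft * (2*nzRow - nzLeft + 1) flops for row i, the expression accumulated below.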
1900 */ 1902 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1903 } 1904 } 1905 fs->numericFactFlops = flops; 1906 } 1907 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1908 PetscFunctionReturn(PETSC_SUCCESS); 1909 } 1910 1911 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1912 { 1913 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1914 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1915 const PetscScalar *barray; 1916 PetscScalar *xarray; 1917 1918 PetscFunctionBegin; 1919 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1920 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1921 PetscCall(PetscLogGpuTimeBegin()); 1922 1923 /* Solve L*y = b */ 1924 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1925 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1926 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1927 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1928 1929 /* Solve Lt*x = y */ 1930 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1931 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1932 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1933 1934 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1935 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1936 1937 PetscCall(PetscLogGpuTimeEnd()); 1938 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1939 PetscFunctionReturn(PETSC_SUCCESS); 1940 } 1941 1942 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1943 { 1944 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1945 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1946 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1947 CsrMatrix *Acsr; 1948 PetscInt m, nz; 1949 PetscBool flg; 1950 1951 PetscFunctionBegin; 1952 if (PetscDefined(USE_DEBUG)) { 1953 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1954 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1955 } 1956 1957 /* Copy A's values to fact */ 1958 m = fact->rmap->n; 1959 nz = aij->nz; 1960 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1961 Acsr = (CsrMatrix *)Acusp->mat->mat; 1962 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1963 1964 /* Factorize fact inplace */ 1965 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1966 csric02() only takes the lower triangular part of matrix A to perform factorization. 1967 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1968 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1969 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
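In effect csric02 computes the zero-fill incomplete Cholesky factor L, with A ~= L*L^H on the sparsity pattern of the lower triangle of A.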
1970 */ 1971 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1972 if (PetscDefined(USE_DEBUG)) { 1973 int numerical_zero; 1974 cusparseStatus_t status; 1975 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1976 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1977 } 1978 1979 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1980 if (fs->updatedSpSVAnalysis) { 1981 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1982 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1983 } else 1984 #endif 1985 { 1986 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1987 1988 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1989 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1990 */ 1991 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1992 fs->updatedSpSVAnalysis = PETSC_TRUE; 1993 } 1994 1995 fact->offloadmask = PETSC_OFFLOAD_GPU; 1996 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1997 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1998 fact->ops->matsolve = NULL; 1999 fact->ops->matsolvetranspose = NULL; 2000 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2001 PetscFunctionReturn(PETSC_SUCCESS); 2002 } 2003 2004 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2005 { 2006 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2007 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2008 PetscInt m, nz; 2009 2010 PetscFunctionBegin; 2011 if (PetscDefined(USE_DEBUG)) { 2012 PetscBool flg, diagDense; 2013 2014 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2015 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2016 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2017 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense)); 2018 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries"); 2019 } 2020 2021 /* Free the old stale stuff */ 2022 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2023 2024 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2025 but they will not be used. Allocate them just for easy debugging. 
2026 */ 2027 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2028 2029 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2030 fact->factortype = MAT_FACTOR_ICC; 2031 fact->info.factor_mallocs = 0; 2032 fact->info.fill_ratio_given = info->fill; 2033 fact->info.fill_ratio_needed = 1.0; 2034 2035 aij->row = NULL; 2036 aij->col = NULL; 2037 2038 /* ====================================================================== */ 2039 /* Copy A's i, j to fact and also allocate the value array of fact. */ 2040 /* We'll do in-place factorization on fact */ 2041 /* ====================================================================== */ 2042 const int *Ai, *Aj; 2043 2044 m = fact->rmap->n; 2045 nz = aij->nz; 2046 2047 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2048 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2049 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2050 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2051 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2052 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2053 2054 /* ====================================================================== */ 2055 /* Create mat descriptors for M, L */ 2056 /* ====================================================================== */ 2057 cusparseFillMode_t fillMode; 2058 cusparseDiagType_t diagType; 2059 2060 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2061 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2062 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2063 2064 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2065 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2066 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2067 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2068 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
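Unlike the unit-diagonal L of ILU(0) above, the IC(0) factor L carries the actual diagonal, hence CUSPARSE_DIAG_TYPE_NON_UNIT below.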
2069 */
2070   fillMode = CUSPARSE_FILL_MODE_LOWER;
2071   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2072   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2073   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2074   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2075
2076   /* ========================================================================= */
2077   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2078   /* ========================================================================= */
2079   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2080   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2081
2082   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2083   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2084
2085   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2086   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2087
2088   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2089   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2090
2091   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2092   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2093
2094   /* To save device memory, we let the factorization buffer share storage with the larger of the two triangular-solve buffers.
2095      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2096   */
2097   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2098     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2099     fs->spsvBuffer_L = fs->factBuffer_M;
2100     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2101   } else {
2102     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2103     fs->spsvBuffer_Lt = fs->factBuffer_M;
2104     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2105   }
2106
2107   /* ========================================================================== */
2108   /* Perform analysis of ic0 on M                                               */
2109   /* The lower triangular part of M has the same sparsity pattern as L          */
2110   /* ========================================================================== */
2111   int              structural_zero;
2112   cusparseStatus_t status;
2113
2114   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2115   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2116   if (PetscDefined(USE_DEBUG)) {
2117     /* cusparseXcsric02_zeroPivot() is a blocking call.
It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2118     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2119     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2120   }
2121
2122   /* Estimate FLOPs of the numeric factorization */
2123   {
2124     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2125     PetscInt      *Ai, nzRow, nzLeft;
2126     PetscLogDouble flops = 0.0;
2127
2128     Ai = Aseq->i;
2129     for (PetscInt i = 0; i < m; i++) {
2130       nzRow = Ai[i + 1] - Ai[i];
2131       if (nzRow > 1) {
2132         /* We eliminate the nonzeros to the left of the diagonal one by one; roughly half of the nzRow - 1 off-diagonal entries lie there, i.e. nzLeft = (nzRow - 1) / 2.
2133            Eliminating the k-th of them updates the nonzeros from the eliminated position to the end of the row (nzRow - k + 1 entries), each update costing one multiplication
2134            and one addition. Summing 2 * (nzRow - k + 1) over k = 1, ..., nzLeft gives nzLeft * (2 * nzRow - nzLeft + 1) flops for this row. */
2135         nzLeft = (nzRow - 1) / 2;
2136         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2137       }
2138     }
2139     fs->numericFactFlops = flops;
2140   }
2141   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2142   PetscFunctionReturn(PETSC_SUCCESS);
2143 }
2144 #endif
2145
2146 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2147 {
2148   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE (A's spptr); B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors instead.
2149   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2150
2151   PetscFunctionBegin;
2152   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2153   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2154   B->offloadmask = PETSC_OFFLOAD_CPU;
2155
2156   if (!cusparsestruct->use_cpu_solve) {
2157 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2158     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2159     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2160 #else
2161     /* determine which version of MatSolve needs to be used.
*/ 2162 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2163 IS isrow = b->row, iscol = b->col; 2164 PetscBool row_identity, col_identity; 2165 2166 PetscCall(ISIdentity(isrow, &row_identity)); 2167 PetscCall(ISIdentity(iscol, &col_identity)); 2168 if (row_identity && col_identity) { 2169 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2170 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2171 } else { 2172 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2173 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2174 } 2175 #endif 2176 } 2177 B->ops->matsolve = NULL; 2178 B->ops->matsolvetranspose = NULL; 2179 2180 /* get the triangular factors */ 2181 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2182 PetscFunctionReturn(PETSC_SUCCESS); 2183 } 2184 2185 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2186 { 2187 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2188 2189 PetscFunctionBegin; 2190 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2191 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2192 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2193 PetscFunctionReturn(PETSC_SUCCESS); 2194 } 2195 2196 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2197 { 2198 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2199 2200 PetscFunctionBegin; 2201 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2202 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2203 if (!info->factoronhost) { 2204 PetscCall(ISIdentity(isrow, &row_identity)); 2205 PetscCall(ISIdentity(iscol, &col_identity)); 2206 } 2207 if (!info->levels && row_identity && col_identity) { 2208 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2209 } else 2210 #endif 2211 { 2212 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2213 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2214 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2215 } 2216 PetscFunctionReturn(PETSC_SUCCESS); 2217 } 2218 2219 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2220 { 2221 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2222 2223 PetscFunctionBegin; 2224 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2225 PetscBool perm_identity = PETSC_FALSE; 2226 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2227 if (!info->levels && perm_identity) { 2228 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2229 } else 2230 #endif 2231 { 2232 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2233 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2234 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2235 } 2236 PetscFunctionReturn(PETSC_SUCCESS); 2237 } 2238 2239 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2240 { 2241 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2242 2243 PetscFunctionBegin; 2244 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2245 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2246 
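  /* Note: as with the LU/ILU paths above, the symbolic phase is delegated to the host SeqAIJ
     implementation; only the numeric phase is redirected, and it too computes the factors on the
     host (cf. MatLUFactorNumeric_SeqAIJCUSPARSE()) before the triangular solves run on the GPU. */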
B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2247   PetscFunctionReturn(PETSC_SUCCESS);
2248 }
2249
2250 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2251 {
2252   PetscFunctionBegin;
2253   *type = MATSOLVERCUSPARSE;
2254   PetscFunctionReturn(PETSC_SUCCESS);
2255 }
2256
2257 /*MC
2258   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2259   of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2260   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2261   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2262   CuSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
2263   algorithms are not recommended. This class does NOT support direct solver operations.
2264
2265   Level: beginner
2266
2267 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2268           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2269 M*/
2270
2271 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2272 {
2273   PetscInt n = A->rmap->n;
2274
2275   PetscFunctionBegin;
2276   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2277   PetscCall(MatSetSizes(*B, n, n, n, n));
2278   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2279   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2280
2281   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2282   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2283     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2284     if (!A->boundtocpu) {
2285       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2286       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2287     } else {
2288       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2289       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2290     }
2291     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2292     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2293     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2294   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2295     if (!A->boundtocpu) {
2296       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2297       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2298     } else {
2299       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2300       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2301     }
2302     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2303     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2304   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2305
2306   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2307   (*B)->canuseordering = PETSC_TRUE;
2308   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
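  /* A minimal usage sketch (illustrative only, not part of this file's API; assumes an assembled
     MATSEQAIJCUSPARSE matrix A and the standard PETSc factorization sequence):

       Mat           F;
       IS            rowperm, colperm;
       MatFactorInfo info;

       PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F));
       PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &rowperm, &colperm));
       PetscCall(MatFactorInfoInitialize(&info));
       PetscCall(MatILUFactorSymbolic(F, A, rowperm, colperm, &info));
       PetscCall(MatLUFactorNumeric(F, A, &info));

     Equivalently, within a KSP/PC one would use -pc_type ilu -pc_factor_mat_solver_type cusparse.
     With the natural ordering and zero fill, MatILUFactorSymbolic_SeqAIJCUSPARSE() selects the
     device ILU(0) path. */
2309 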
PetscFunctionReturn(PETSC_SUCCESS); 2310 } 2311 2312 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2313 { 2314 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2315 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2316 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2317 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 2318 #endif 2319 2320 PetscFunctionBegin; 2321 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 2322 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2323 if (A->factortype == MAT_FACTOR_NONE) { 2324 CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 2325 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2326 } 2327 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2328 else if (fs->csrVal) { 2329 /* We have a factorized matrix on device and are able to copy it to host */ 2330 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2331 } 2332 #endif 2333 else 2334 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 2335 PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 2336 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2337 A->offloadmask = PETSC_OFFLOAD_BOTH; 2338 } 2339 PetscFunctionReturn(PETSC_SUCCESS); 2340 } 2341 2342 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2343 { 2344 PetscFunctionBegin; 2345 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2346 *array = ((Mat_SeqAIJ *)A->data)->a; 2347 PetscFunctionReturn(PETSC_SUCCESS); 2348 } 2349 2350 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2351 { 2352 PetscFunctionBegin; 2353 A->offloadmask = PETSC_OFFLOAD_CPU; 2354 *array = NULL; 2355 PetscFunctionReturn(PETSC_SUCCESS); 2356 } 2357 2358 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2359 { 2360 PetscFunctionBegin; 2361 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2362 *array = ((Mat_SeqAIJ *)A->data)->a; 2363 PetscFunctionReturn(PETSC_SUCCESS); 2364 } 2365 2366 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2367 { 2368 PetscFunctionBegin; 2369 *array = NULL; 2370 PetscFunctionReturn(PETSC_SUCCESS); 2371 } 2372 2373 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2374 { 2375 PetscFunctionBegin; 2376 *array = ((Mat_SeqAIJ *)A->data)->a; 2377 PetscFunctionReturn(PETSC_SUCCESS); 2378 } 2379 2380 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2381 { 2382 PetscFunctionBegin; 2383 A->offloadmask = PETSC_OFFLOAD_CPU; 2384 *array = NULL; 2385 PetscFunctionReturn(PETSC_SUCCESS); 2386 } 2387 2388 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2389 { 2390 Mat_SeqAIJCUSPARSE *cusp; 2391 CsrMatrix *matrix; 2392 2393 PetscFunctionBegin; 2394 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2395 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 2396 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2397 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 2398 matrix = (CsrMatrix *)cusp->mat->mat; 2399 2400 if (i) { 2401 #if !defined(PETSC_USE_64BIT_INDICES) 2402 *i = 
matrix->row_offsets->data().get();
2403 #else
2404     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2405 #endif
2406   }
2407   if (j) {
2408 #if !defined(PETSC_USE_64BIT_INDICES)
2409     *j = matrix->column_indices->data().get();
2410 #else
2411     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2412 #endif
2413   }
2414   if (a) *a = matrix->values->data().get();
2415   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2416   PetscFunctionReturn(PETSC_SUCCESS);
2417 }
2418
2419 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2420 {
2421   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2422   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2423   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2424   PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
2425   cusparseStatus_t              stat;
2426   PetscBool                     both = PETSC_TRUE;
2427
2428   PetscFunctionBegin;
2429   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2430   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2431     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2432       CsrMatrix *matrix;
2433       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2434
2435       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2436       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2437       matrix->values->assign(a->a, a->a + a->nz);
2438       PetscCallCUDA(WaitForCUDA());
2439       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2440       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2441       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2442     } else {
2443       PetscInt nnz;
2444       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2445       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2446       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2447       delete cusparsestruct->workVector;
2448       delete cusparsestruct->rowoffsets_gpu;
2449       cusparsestruct->workVector     = NULL;
2450       cusparsestruct->rowoffsets_gpu = NULL;
2451       try {
2452         if (a->compressedrow.use) {
2453           m    = a->compressedrow.nrows;
2454           ii   = a->compressedrow.i;
2455           ridx = a->compressedrow.rindex;
2456         } else {
2457           m    = A->rmap->n;
2458           ii   = a->i;
2459           ridx = NULL;
2460         }
2461         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2462         if (!a->a) {
2463           nnz  = ii[m];
2464           both = PETSC_FALSE;
2465         } else nnz = a->nz;
2466         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2467
2468         /* create cusparse matrix */
2469         cusparsestruct->nrows = m;
2470         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2471         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2472         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2473         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2474
2475         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2476         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2477         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2478         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
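        /* The host constants 1 and 0 are staged into device memory (alpha_one, beta_zero, beta_one)
           because the handle is switched to CUSPARSE_POINTER_MODE_DEVICE below, which requires the
           alpha/beta arguments of subsequent cusparse calls to be device pointers. */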
2480 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2481 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2482 2483 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2484 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2485 /* set the matrix */ 2486 CsrMatrix *mat = new CsrMatrix; 2487 mat->num_rows = m; 2488 mat->num_cols = A->cmap->n; 2489 mat->num_entries = nnz; 2490 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2491 mat->row_offsets->assign(ii, ii + m + 1); 2492 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2493 mat->column_indices->assign(a->j, a->j + nnz); 2494 2495 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2496 if (a->a) mat->values->assign(a->a, a->a + nnz); 2497 2498 /* assign the pointer */ 2499 matstruct->mat = mat; 2500 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2501 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2502 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2503 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2504 PetscCallCUSPARSE(stat); 2505 } 2506 #endif 2507 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2508 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2509 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2510 #else 2511 CsrMatrix *mat = new CsrMatrix; 2512 mat->num_rows = m; 2513 mat->num_cols = A->cmap->n; 2514 mat->num_entries = nnz; 2515 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2516 mat->row_offsets->assign(ii, ii + m + 1); 2517 2518 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2519 mat->column_indices->assign(a->j, a->j + nnz); 2520 2521 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2522 if (a->a) mat->values->assign(a->a, a->a + nnz); 2523 2524 cusparseHybMat_t hybMat; 2525 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2526 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2527 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2528 PetscCallCUSPARSE(stat); 2529 /* assign the pointer */ 2530 matstruct->mat = hybMat; 2531 2532 if (mat) { 2533 if (mat->values) delete (THRUSTARRAY *)mat->values; 2534 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2535 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2536 delete (CsrMatrix *)mat; 2537 } 2538 #endif 2539 } 2540 2541 /* assign the compressed row indices */ 2542 if (a->compressedrow.use) { 2543 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2544 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2545 matstruct->cprowIndices->assign(ridx, ridx + m); 2546 tmp = m; 2547 } else { 2548 cusparsestruct->workVector = NULL; 2549 matstruct->cprowIndices = NULL; 2550 tmp = 0; 2551 } 2552 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2553 2554 /* assign the pointer */ 2555 cusparsestruct->mat = matstruct; 2556 } catch (char *ex) { 2557 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2558 } 2559 PetscCallCUDA(WaitForCUDA()); 2560 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2561 cusparsestruct->nonzerostate = A->nonzerostate; 2562 } 2563 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2564 } 2565 PetscFunctionReturn(PETSC_SUCCESS); 2566 } 2567 2568 struct VecCUDAPlusEquals { 2569 template <typename Tuple> 2570 __host__ __device__ void operator()(Tuple t) 2571 { 2572 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2573 } 2574 }; 2575 2576 struct VecCUDAEquals { 2577 template <typename Tuple> 2578 __host__ __device__ void operator()(Tuple t) 2579 { 2580 thrust::get<1>(t) = thrust::get<0>(t); 2581 } 2582 }; 2583 2584 struct VecCUDAEqualsReverse { 2585 template <typename Tuple> 2586 __host__ __device__ void operator()(Tuple t) 2587 { 2588 thrust::get<0>(t) = thrust::get<1>(t); 2589 } 2590 }; 2591 2592 struct MatProductCtx_MatMatCusparse { 2593 PetscBool cisdense; 2594 PetscScalar *Bt; 2595 Mat X; 2596 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2597 PetscLogDouble flops; 2598 CsrMatrix *Bcsr; 2599 2600 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2601 cusparseSpMatDescr_t matSpBDescr; 2602 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2603 cusparseDnMatDescr_t matBDescr; 2604 cusparseDnMatDescr_t matCDescr; 2605 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2606 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2607 void *dBuffer4; 2608 void *dBuffer5; 2609 #endif 2610 size_t mmBufferSize; 2611 void *mmBuffer; 2612 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2613 cusparseSpGEMMDescr_t spgemmDesc; 2614 #endif 2615 }; 2616 2617 static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data) 2618 { 2619 MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data; 2620 2621 PetscFunctionBegin; 2622 PetscCallCUDA(cudaFree(mmdata->Bt)); 2623 delete mmdata->Bcsr; 2624 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2625 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2626 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2627 
if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2628 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2629 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2630 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2631 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2632 #endif 2633 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2634 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2635 #endif 2636 PetscCall(MatDestroy(&mmdata->X)); 2637 PetscCall(PetscFree(*data)); 2638 PetscFunctionReturn(PETSC_SUCCESS); 2639 } 2640 2641 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2642 2643 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2644 { 2645 Mat_Product *product = C->product; 2646 Mat A, B; 2647 PetscInt m, n, blda, clda; 2648 PetscBool flg, biscuda; 2649 Mat_SeqAIJCUSPARSE *cusp; 2650 cusparseStatus_t stat; 2651 cusparseOperation_t opA; 2652 const PetscScalar *barray; 2653 PetscScalar *carray; 2654 MatProductCtx_MatMatCusparse *mmdata; 2655 Mat_SeqAIJCUSPARSEMultStruct *mat; 2656 CsrMatrix *csrmat; 2657 2658 PetscFunctionBegin; 2659 MatCheckProduct(C, 1); 2660 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2661 mmdata = (MatProductCtx_MatMatCusparse *)product->data; 2662 A = product->A; 2663 B = product->B; 2664 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2665 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2666 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2667 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2668 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2669 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2670 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2671 switch (product->type) { 2672 case MATPRODUCT_AB: 2673 case MATPRODUCT_PtAP: 2674 mat = cusp->mat; 2675 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2676 m = A->rmap->n; 2677 n = B->cmap->n; 2678 break; 2679 case MATPRODUCT_AtB: 2680 if (!A->form_explicit_transpose) { 2681 mat = cusp->mat; 2682 opA = CUSPARSE_OPERATION_TRANSPOSE; 2683 } else { 2684 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2685 mat = cusp->matTranspose; 2686 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2687 } 2688 m = A->cmap->n; 2689 n = B->cmap->n; 2690 break; 2691 case MATPRODUCT_ABt: 2692 case MATPRODUCT_RARt: 2693 mat = cusp->mat; 2694 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2695 m = A->rmap->n; 2696 n = B->rmap->n; 2697 break; 2698 default: 2699 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2700 } 2701 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2702 csrmat = (CsrMatrix *)mat->mat; 2703 /* if the user passed a CPU matrix, copy the data to the GPU */ 2704 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2705 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2706 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2707 2708 PetscCall(MatDenseGetLDA(B, &blda)); 2709 if (product->type == MATPRODUCT_RARt || product->type == 
MATPRODUCT_PtAP) { 2710 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2711 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2712 } else { 2713 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2714 PetscCall(MatDenseGetLDA(C, &clda)); 2715 } 2716 2717 PetscCall(PetscLogGpuTimeBegin()); 2718 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2719 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2720 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2721 cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2722 #else 2723 cusparseSpMatDescr_t &matADescr = mat->matDescr; 2724 #endif 2725 2726 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2727 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2728 size_t mmBufferSize; 2729 if (mmdata->initialized && mmdata->Blda != blda) { 2730 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2731 mmdata->matBDescr = NULL; 2732 } 2733 if (!mmdata->matBDescr) { 2734 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2735 mmdata->Blda = blda; 2736 } 2737 2738 if (mmdata->initialized && mmdata->Clda != clda) { 2739 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2740 mmdata->matCDescr = NULL; 2741 } 2742 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2743 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2744 mmdata->Clda = clda; 2745 } 2746 2747 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2748 if (matADescr) { 2749 PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. 
It could be a cusparse bug 2750 matADescr = NULL; 2751 } 2752 #endif 2753 2754 if (!matADescr) { 2755 stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2756 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2757 PetscCallCUSPARSE(stat); 2758 } 2759 2760 PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2761 2762 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2763 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2764 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2765 mmdata->mmBufferSize = mmBufferSize; 2766 } 2767 2768 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0 2769 PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2770 #endif 2771 2772 mmdata->initialized = PETSC_TRUE; 2773 } else { 2774 /* to be safe, always update pointers of the mats */ 2775 PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 2776 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2777 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2778 } 2779 2780 /* do cusparseSpMM, which supports transpose on B */ 2781 PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2782 #else 2783 PetscInt k; 2784 /* cusparseXcsrmm does not support transpose on B */ 2785 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2786 cublasHandle_t cublasv2handle; 2787 cublasStatus_t cerr; 2788 2789 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2790 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2791 PetscCallCUBLAS(cerr); 2792 blda = B->cmap->n; 2793 k = B->cmap->n; 2794 } else { 2795 k = B->rmap->n; 2796 } 2797 2798 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2799 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2800 PetscCallCUSPARSE(stat); 2801 #endif 2802 PetscCall(PetscLogGpuTimeEnd()); 2803 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2804 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2805 if (product->type == MATPRODUCT_RARt) { 2806 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2807 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2808 } else if (product->type == MATPRODUCT_PtAP) { 2809 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2810 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2811 } else { 2812 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2813 } 2814 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2815 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2816 PetscFunctionReturn(PETSC_SUCCESS); 2817 } 2818 2819 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2820 { 2821 Mat_Product *product = C->product; 2822 Mat A, B; 2823 PetscInt m, n; 2824 PetscBool cisdense, flg; 2825 MatProductCtx_MatMatCusparse *mmdata; 2826 Mat_SeqAIJCUSPARSE *cusp; 2827 2828 PetscFunctionBegin; 2829 MatCheckProduct(C, 1); 2830 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2831 A = product->A; 2832 B = product->B; 2833 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2834 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2835 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2836 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2837 switch (product->type) { 2838 case MATPRODUCT_AB: 2839 m = A->rmap->n; 2840 n = B->cmap->n; 2841 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 2842 break; 2843 case MATPRODUCT_AtB: 2844 m = A->cmap->n; 2845 n = B->cmap->n; 2846 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 2847 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2848 break; 2849 case MATPRODUCT_ABt: 2850 m = A->rmap->n; 2851 n = B->rmap->n; 2852 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 2853 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2854 break; 2855 case MATPRODUCT_PtAP: 2856 m = B->cmap->n; 2857 n = B->cmap->n; 2858 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs)); 2859 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2860 break; 2861 case MATPRODUCT_RARt: 2862 m = B->rmap->n; 2863 n = B->rmap->n; 2864 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs)); 2865 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2866 break; 2867 default: 2868 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2869 } 2870 PetscCall(MatSetSizes(C, m, n, m, n)); 2871 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2872 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2873 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2874 2875 /* product data */ 2876 PetscCall(PetscNew(&mmdata)); 2877 
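  /* mmdata carries all per-product GPU resources (cusparse descriptors, SpMM work buffers, and the
     intermediate dense matrix X used by PtAP/RARt); it is released via C->product->destroy, set to
     MatProductCtxDestroy_MatMatCusparse() below. */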
mmdata->cisdense = cisdense; 2878 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2879 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2880 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2881 #endif 2882 /* for these products we need intermediate storage */ 2883 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2884 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2885 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2886 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2887 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2888 } else { 2889 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2890 } 2891 } 2892 C->product->data = mmdata; 2893 C->product->destroy = MatProductCtxDestroy_MatMatCusparse; 2894 2895 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2896 PetscFunctionReturn(PETSC_SUCCESS); 2897 } 2898 2899 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2900 { 2901 Mat_Product *product = C->product; 2902 Mat A, B; 2903 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2904 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2905 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2906 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2907 PetscBool flg; 2908 cusparseStatus_t stat; 2909 MatProductType ptype; 2910 MatProductCtx_MatMatCusparse *mmdata; 2911 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2912 cusparseSpMatDescr_t BmatSpDescr; 2913 #endif 2914 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2915 2916 PetscFunctionBegin; 2917 MatCheckProduct(C, 1); 2918 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2919 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 2920 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2921 mmdata = (MatProductCtx_MatMatCusparse *)C->product->data; 2922 A = product->A; 2923 B = product->B; 2924 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2925 mmdata->reusesym = PETSC_FALSE; 2926 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2927 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2928 Cmat = Ccusp->mat; 2929 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2930 Ccsr = (CsrMatrix *)Cmat->mat; 2931 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2932 goto finalize; 2933 } 2934 if (!c->nz) goto finalize; 2935 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2936 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2937 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2938 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2939 
PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2940 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2941 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2942 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2943 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2944 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2945 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2946 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2947 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2948 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2949 2950 ptype = product->type; 2951 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2952 ptype = MATPRODUCT_AB; 2953 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2954 } 2955 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2956 ptype = MATPRODUCT_AB; 2957 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2958 } 2959 switch (ptype) { 2960 case MATPRODUCT_AB: 2961 Amat = Acusp->mat; 2962 Bmat = Bcusp->mat; 2963 break; 2964 case MATPRODUCT_AtB: 2965 Amat = Acusp->matTranspose; 2966 Bmat = Bcusp->mat; 2967 break; 2968 case MATPRODUCT_ABt: 2969 Amat = Acusp->mat; 2970 Bmat = Bcusp->matTranspose; 2971 break; 2972 default: 2973 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2974 } 2975 Cmat = Ccusp->mat; 2976 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2977 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2978 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2979 Acsr = (CsrMatrix *)Amat->mat; 2980 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2981 Ccsr = (CsrMatrix *)Cmat->mat; 2982 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2983 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2984 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2985 PetscCall(PetscLogGpuTimeBegin()); 2986 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2987 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2988 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2989 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2990 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2991 PetscCallCUSPARSE(stat); 2992 #else 2993 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2994 PetscCallCUSPARSE(stat); 2995 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2996 PetscCallCUSPARSE(stat); 2997 #endif 2998 #else 2999 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3000 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3001 PetscCallCUSPARSE(stat); 3002 #endif 3003 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3004 PetscCallCUDA(WaitForCUDA()); 3005 PetscCall(PetscLogGpuTimeEnd()); 3006 C->offloadmask = PETSC_OFFLOAD_GPU; 3007 finalize: 3008 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3009 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 3010 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 3011 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 3012 c->reallocs = 0; 3013 C->info.mallocs += 0; 3014 C->info.nz_unneeded = 0; 3015 C->assembled = C->was_assembled = PETSC_TRUE; 3016 C->num_ass++; 3017 PetscFunctionReturn(PETSC_SUCCESS); 3018 } 3019 3020 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3021 { 3022 Mat_Product *product = C->product; 3023 Mat A, B; 3024 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 3025 Mat_SeqAIJ *a, *b, *c; 3026 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 3027 CsrMatrix *Acsr, *Bcsr, *Ccsr; 3028 PetscInt i, j, m, n, k; 3029 PetscBool flg; 3030 cusparseStatus_t stat; 3031 MatProductType ptype; 3032 MatProductCtx_MatMatCusparse *mmdata; 3033 PetscLogDouble flops; 3034 PetscBool biscompressed, ciscompressed; 3035 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3036 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3037 cusparseSpMatDescr_t BmatSpDescr; 3038 #else 3039 int cnz; 3040 #endif 3041 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3042 3043 PetscFunctionBegin; 3044 MatCheckProduct(C, 1); 3045 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3046 A = product->A; 3047 B = product->B; 3048 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3049 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", 
((PetscObject)A)->type_name);
3050   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3051   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3052   a = (Mat_SeqAIJ *)A->data;
3053   b = (Mat_SeqAIJ *)B->data;
3054   /* product data */
3055   PetscCall(PetscNew(&mmdata));
3056   C->product->data    = mmdata;
3057   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3058
3059   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3060   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3061   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3062   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3063   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3064   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3065
3066   ptype = product->type;
3067   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3068     ptype                                          = MATPRODUCT_AB;
3069     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3070   }
3071   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3072     ptype                                          = MATPRODUCT_AB;
3073     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3074   }
3075   biscompressed = PETSC_FALSE;
3076   ciscompressed = PETSC_FALSE;
3077   switch (ptype) {
3078   case MATPRODUCT_AB:
3079     m    = A->rmap->n;
3080     n    = B->cmap->n;
3081     k    = A->cmap->n;
3082     Amat = Acusp->mat;
3083     Bmat = Bcusp->mat;
3084     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3085     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3086     break;
3087   case MATPRODUCT_AtB:
3088     m = A->cmap->n;
3089     n = B->cmap->n;
3090     k = A->rmap->n;
3091     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3092     Amat = Acusp->matTranspose;
3093     Bmat = Bcusp->mat;
3094     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3095     break;
3096   case MATPRODUCT_ABt:
3097     m = A->rmap->n;
3098     n = B->rmap->n;
3099     k = A->cmap->n;
3100     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3101     Amat = Acusp->mat;
3102     Bmat = Bcusp->matTranspose;
3103     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3104     break;
3105   default:
3106     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3107   }
3108
3109   /* create cusparse matrix */
3110   PetscCall(MatSetSizes(C, m, n, m, n));
3111   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3112   c     = (Mat_SeqAIJ *)C->data;
3113   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3114   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3115   Ccsr  = new CsrMatrix;
3116
3117   c->compressedrow.use = ciscompressed;
3118   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be too */
3119     c->compressedrow.nrows = a->compressedrow.nrows;
3120     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3121     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3122     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3123     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3124     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3125   } else {
3126     c->compressedrow.nrows  = 0;
3127     c->compressedrow.i      = NULL;
3128     c->compressedrow.rindex = NULL;
3129     Ccusp->workVector       = NULL;
3130     Cmat->cprowIndices =
NULL; 3131 } 3132 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3133 Ccusp->mat = Cmat; 3134 Ccusp->mat->mat = Ccsr; 3135 Ccsr->num_rows = Ccusp->nrows; 3136 Ccsr->num_cols = n; 3137 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3138 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3139 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3140 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3141 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3142 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3143 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3144 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3145 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3146 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3147 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3148 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3149 c->nz = 0; 3150 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3151 Ccsr->values = new THRUSTARRAY(c->nz); 3152 goto finalizesym; 3153 } 3154 3155 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3156 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3157 Acsr = (CsrMatrix *)Amat->mat; 3158 if (!biscompressed) { 3159 Bcsr = (CsrMatrix *)Bmat->mat; 3160 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3161 BmatSpDescr = Bmat->matDescr; 3162 #endif 3163 } else { /* we need to use row offsets for the full matrix */ 3164 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3165 Bcsr = new CsrMatrix; 3166 Bcsr->num_rows = B->rmap->n; 3167 Bcsr->num_cols = cBcsr->num_cols; 3168 Bcsr->num_entries = cBcsr->num_entries; 3169 Bcsr->column_indices = cBcsr->column_indices; 3170 Bcsr->values = cBcsr->values; 3171 if (!Bcusp->rowoffsets_gpu) { 3172 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3173 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3174 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3175 } 3176 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3177 mmdata->Bcsr = Bcsr; 3178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3179 if (Bcsr->num_rows && Bcsr->num_cols) { 3180 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3181 PetscCallCUSPARSE(stat); 3182 } 3183 BmatSpDescr = mmdata->matSpBDescr; 3184 #endif 3185 } 3186 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3187 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3188 /* precompute flops count */ 3189 if (ptype == MATPRODUCT_AB) { 3190 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3191 const PetscInt st = a->i[i]; 3192 const PetscInt en = a->i[i + 1]; 3193 for (j = st; j < en; j++) { 3194 const PetscInt brow = a->j[j]; 
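        /* each nonzero A(i,brow) is multiplied against every entry of row brow of B, at one
           multiplication and one addition per entry, hence the factor 2 below */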
3195 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3196 } 3197 } 3198 } else if (ptype == MATPRODUCT_AtB) { 3199 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3200 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3201 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3202 flops += (2. * anzi) * bnzi; 3203 } 3204 } else { /* TODO */ 3205 flops = 0.; 3206 } 3207 3208 mmdata->flops = flops; 3209 PetscCall(PetscLogGpuTimeBegin()); 3210 3211 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3212 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3213 // cuda-12.2 requires non-null csrRowOffsets 3214 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3215 PetscCallCUSPARSE(stat); 3216 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3217 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3218 { 3219 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3220 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3221 */ 3222 void *dBuffer1 = NULL; 3223 void *dBuffer2 = NULL; 3224 void *dBuffer3 = NULL; 3225 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3226 size_t bufferSize1 = 0; 3227 size_t bufferSize2 = 0; 3228 size_t bufferSize3 = 0; 3229 size_t bufferSize4 = 0; 3230 size_t bufferSize5 = 0; 3231 3232 /* ask bufferSize1 bytes for external memory */ 3233 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3234 PetscCallCUSPARSE(stat); 3235 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3236 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3237 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3238 PetscCallCUSPARSE(stat); 3239 3240 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3241 PetscCallCUSPARSE(stat); 3242 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3243 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3244 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3245 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3246 PetscCallCUSPARSE(stat); 3247 PetscCallCUDA(cudaFree(dBuffer1)); 3248 PetscCallCUDA(cudaFree(dBuffer2)); 3249 3250 /* get matrix C non-zero entries C_nnz1 */ 3251 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3252 c->nz = (PetscInt)C_nnz1; 3253 /* allocate matrix C */ 3254 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3255 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3256 Ccsr->values = new THRUSTARRAY(c->nz); 3257 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3258 /* update matC with the new pointers */ 3259 stat = cusparseCsrSetPointers(Cmat->matDescr, 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3260 PetscCallCUSPARSE(stat); 3261 3262 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3263 PetscCallCUSPARSE(stat); 3264 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3265 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3266 PetscCallCUSPARSE(stat); 3267 PetscCallCUDA(cudaFree(dBuffer3)); 3268 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3269 PetscCallCUSPARSE(stat); 3270 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3271 } 3272 #else 3273 size_t bufSize2; 3274 /* ask bufferSize bytes for external memory */ 3275 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 3276 PetscCallCUSPARSE(stat); 3277 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3278 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3279 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3280 PetscCallCUSPARSE(stat); 3281 /* ask again for bufferSize bytes of external memory */ 3282 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3283 PetscCallCUSPARSE(stat); 3284 /* The CUSPARSE documentation is not clear, and neither is the API: 3285 we need both buffers to perform the operations properly! 3286 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API; 3287 it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address 3288 is stored in the descriptor! What a messy API...
*/ 3289 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3290 /* compute the intermediate product of A * B */ 3291 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3292 PetscCallCUSPARSE(stat); 3293 /* get matrix C non-zero entries C_nnz1 */ 3294 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3295 c->nz = (PetscInt)C_nnz1; 3296 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3297 mmdata->mmBufferSize / 1024)); 3298 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3299 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3300 Ccsr->values = new THRUSTARRAY(c->nz); 3301 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3302 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3303 PetscCallCUSPARSE(stat); 3304 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3305 PetscCallCUSPARSE(stat); 3306 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3307 #else 3308 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3309 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3310 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3311 PetscCallCUSPARSE(stat); 3312 c->nz = cnz; 3313 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3314 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3315 Ccsr->values = new THRUSTARRAY(c->nz); 3316 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3317 3318 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3319 /* with the old gemm interface (removed from 11.0 on) we cannot compute only the symbolic factorization. 3320 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when 3321 D is NULL, despite the fact that the CUSPARSE documentation claims it is supported!
*/ 3322 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3323 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3324 PetscCallCUSPARSE(stat); 3325 #endif 3326 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3327 PetscCall(PetscLogGpuTimeEnd()); 3328 finalizesym: 3329 c->free_a = PETSC_TRUE; 3330 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3331 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3332 c->free_ij = PETSC_TRUE; 3333 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3334 PetscInt *d_i = c->i; 3335 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3336 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3337 ii = *Ccsr->row_offsets; 3338 jj = *Ccsr->column_indices; 3339 if (ciscompressed) d_i = c->compressedrow.i; 3340 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3341 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3342 } else { 3343 PetscInt *d_i = c->i; 3344 if (ciscompressed) d_i = c->compressedrow.i; 3345 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3346 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3347 } 3348 if (ciscompressed) { /* need to expand host row offsets */ 3349 PetscInt r = 0; 3350 c->i[0] = 0; 3351 for (k = 0; k < c->compressedrow.nrows; k++) { 3352 const PetscInt next = c->compressedrow.rindex[k]; 3353 const PetscInt old = c->compressedrow.i[k]; 3354 for (; r < next; r++) c->i[r + 1] = old; 3355 } 3356 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3357 } 3358 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3359 PetscCall(PetscMalloc1(m, &c->ilen)); 3360 PetscCall(PetscMalloc1(m, &c->imax)); 3361 c->maxnz = c->nz; 3362 c->nonzerorowcnt = 0; 3363 c->rmax = 0; 3364 for (k = 0; k < m; k++) { 3365 const PetscInt nn = c->i[k + 1] - c->i[k]; 3366 c->ilen[k] = c->imax[k] = nn; 3367 c->nonzerorowcnt += (PetscInt)!!nn; 3368 c->rmax = PetscMax(c->rmax, nn); 3369 } 3370 PetscCall(PetscMalloc1(c->nz, &c->a)); 3371 Ccsr->num_entries = c->nz; 3372 3373 C->nonzerostate++; 3374 PetscCall(PetscLayoutSetUp(C->rmap)); 3375 PetscCall(PetscLayoutSetUp(C->cmap)); 3376 Ccusp->nonzerostate = C->nonzerostate; 3377 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3378 C->preallocated = PETSC_TRUE; 3379 C->assembled = PETSC_FALSE; 3380 C->was_assembled = PETSC_FALSE; 3381 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3382 mmdata->reusesym = PETSC_TRUE; 3383 C->offloadmask = PETSC_OFFLOAD_GPU; 3384 } 3385 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3386 PetscFunctionReturn(PETSC_SUCCESS); 3387 } 3388 3389 PETSC_INTERN 
PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3390 3391 /* handles sparse or dense B */ 3392 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3393 { 3394 Mat_Product *product = mat->product; 3395 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3396 3397 PetscFunctionBegin; 3398 MatCheckProduct(mat, 1); 3399 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3400 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3401 if (product->type == MATPRODUCT_ABC) { 3402 Ciscusp = PETSC_FALSE; 3403 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3404 } 3405 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3406 PetscBool usecpu = PETSC_FALSE; 3407 switch (product->type) { 3408 case MATPRODUCT_AB: 3409 if (product->api_user) { 3410 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3411 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3412 PetscOptionsEnd(); 3413 } else { 3414 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3415 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3416 PetscOptionsEnd(); 3417 } 3418 break; 3419 case MATPRODUCT_AtB: 3420 if (product->api_user) { 3421 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3422 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3423 PetscOptionsEnd(); 3424 } else { 3425 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3426 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3427 PetscOptionsEnd(); 3428 } 3429 break; 3430 case MATPRODUCT_PtAP: 3431 if (product->api_user) { 3432 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3433 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3434 PetscOptionsEnd(); 3435 } else { 3436 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3437 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3438 PetscOptionsEnd(); 3439 } 3440 break; 3441 case MATPRODUCT_RARt: 3442 if (product->api_user) { 3443 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3444 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3445 PetscOptionsEnd(); 3446 } else { 3447 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3448 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3449 PetscOptionsEnd(); 3450 } 3451 break; 3452 case MATPRODUCT_ABC: 3453 if (product->api_user) { 3454 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 3455 
PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3456 PetscOptionsEnd(); 3457 } else { 3458 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3459 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3460 PetscOptionsEnd(); 3461 } 3462 break; 3463 default: 3464 break; 3465 } 3466 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3467 } 3468 /* dispatch */ 3469 if (isdense) { 3470 switch (product->type) { 3471 case MATPRODUCT_AB: 3472 case MATPRODUCT_AtB: 3473 case MATPRODUCT_ABt: 3474 case MATPRODUCT_PtAP: 3475 case MATPRODUCT_RARt: 3476 if (product->A->boundtocpu) { 3477 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3478 } else { 3479 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3480 } 3481 break; 3482 case MATPRODUCT_ABC: 3483 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3484 break; 3485 default: 3486 break; 3487 } 3488 } else if (Biscusp && Ciscusp) { 3489 switch (product->type) { 3490 case MATPRODUCT_AB: 3491 case MATPRODUCT_AtB: 3492 case MATPRODUCT_ABt: 3493 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3494 break; 3495 case MATPRODUCT_PtAP: 3496 case MATPRODUCT_RARt: 3497 case MATPRODUCT_ABC: 3498 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3499 break; 3500 default: 3501 break; 3502 } 3503 } else { /* fallback for AIJ */ 3504 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3505 } 3506 PetscFunctionReturn(PETSC_SUCCESS); 3507 } 3508 3509 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3510 { 3511 PetscFunctionBegin; 3512 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3513 PetscFunctionReturn(PETSC_SUCCESS); 3514 } 3515 3516 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3517 { 3518 PetscFunctionBegin; 3519 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3520 PetscFunctionReturn(PETSC_SUCCESS); 3521 } 3522 3523 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3524 { 3525 PetscFunctionBegin; 3526 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3527 PetscFunctionReturn(PETSC_SUCCESS); 3528 } 3529 3530 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3531 { 3532 PetscFunctionBegin; 3533 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3534 PetscFunctionReturn(PETSC_SUCCESS); 3535 } 3536 3537 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3538 { 3539 PetscFunctionBegin; 3540 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3541 PetscFunctionReturn(PETSC_SUCCESS); 3542 } 3543 3544 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3545 { 3546 int i = blockIdx.x * blockDim.x + threadIdx.x; 3547 if (i < n) y[idx[i]] += x[i]; 3548 } 3549 3550 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3551 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3552 { 3553 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3554 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3555 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3556 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3557 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3558 PetscBool compressed; 3559 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3560 PetscInt nx, ny; 3561 #endif 3562 3563 PetscFunctionBegin; 3564 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported"); 3565 if (!a->nz) { 3566 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3567 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3568 PetscFunctionReturn(PETSC_SUCCESS); 3569 } 3570 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3571 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3572 if (!trans) { 3573 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3574 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3575 } else { 3576 if (herm || !A->form_explicit_transpose) { 3577 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3578 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3579 } else { 3580 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3581 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3582 } 3583 } 3584 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3585 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3586 3587 try { 3588 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3589 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3590 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3591 3592 PetscCall(PetscLogGpuTimeBegin()); 3593 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3594 /* z = A x + beta y. 3595 If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3596 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3597 */ 3598 xptr = xarray; 3599 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3600 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3601 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3602 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3603 allocated to accommodate different uses. So we get the length info directly from mat. 3604 */ 3605 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3606 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3607 nx = mat->num_cols; // since y = Ax 3608 ny = mat->num_rows; 3609 } 3610 #endif 3611 } else { 3612 /* z = A^T x + beta y 3613 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3614 Note A^T x is of full length, so we set beta to 1.0 if y exists. 3615 */ 3616 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3617 dptr = zarray; 3618 beta = yy ?
matstruct->beta_one : matstruct->beta_zero; 3619 if (compressed) { /* Scatter x to work vector */ 3620 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3621 3622 thrust::for_each( 3623 #if PetscDefined(HAVE_THRUST_ASYNC) 3624 thrust::cuda::par.on(PetscDefaultCudaStream), 3625 #endif 3626 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3627 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3628 } 3629 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3630 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3631 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3632 nx = mat->num_rows; // since y = A^T x 3633 ny = mat->num_cols; 3634 } 3635 #endif 3636 } 3637 3638 /* csr_spmv does y = alpha op(A) x + beta y */ 3639 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3640 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3641 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3642 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 3643 #else 3644 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3645 #endif 3646 3647 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3648 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3649 if (!matDescr) { 3650 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3651 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3652 } 3653 #endif 3654 3655 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3656 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3657 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3658 PetscCallCUSPARSE( 3659 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3660 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3661 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3662 PetscCallCUSPARSE( 3663 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3664 #endif 3665 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3666 } else { 3667 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3668 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3669 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3670 } 3671 3672 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, 
matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3673 #else 3674 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3675 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3676 #endif 3677 } else { 3678 if (cusparsestruct->nrows) { 3679 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3680 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3681 #else 3682 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3683 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3684 #endif 3685 } 3686 } 3687 PetscCall(PetscLogGpuTimeEnd()); 3688 3689 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3690 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3691 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3692 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3693 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3694 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3695 } 3696 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3697 PetscCall(VecSeq_CUDA::Set(zz, 0)); 3698 } 3699 3700 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3701 if (compressed) { 3702 PetscCall(PetscLogGpuTimeBegin()); 3703 PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 3704 ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3705 PetscCall(PetscLogGpuTimeEnd()); 3706 } 3707 } else { 3708 if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3709 } 3710 PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 3711 if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 3712 else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3713 } catch (char *ex) { 3714 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3715 } 3716 if (yy) { 3717 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3718 } else { 3719 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3720 } 3721 PetscFunctionReturn(PETSC_SUCCESS); 3722 } 3723 3724 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3725 { 3726 PetscFunctionBegin; 3727 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3728 PetscFunctionReturn(PETSC_SUCCESS); 3729 } 3730 3731 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx); 3732 3733 __global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag) 3734 { 3735 const size_t x = blockIdx.x * blockDim.x + threadIdx.x; 3736 3737 if (x < len) { 3738 const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx; 3739 PetscScalar d = 0.0; 3740 3741 for (PetscInt i = 0; i < num_non0_row; i++) { 3742 if (col[i + rowx] == x) { 3743 d = val[i + rowx]; 3744 break; 3745 } 3746 } 3747 diag[x] = d; 3748 } 3749 } 3750 3751 static PetscErrorCode 
MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag) 3752 { 3753 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3754 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3755 PetscScalar *darray; 3756 3757 PetscFunctionBegin; 3758 if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) { 3759 PetscInt n = A->rmap->n; 3760 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3761 3762 PetscCheck(cusparsestruct->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported"); 3763 if (n > 0) { 3764 PetscCall(VecCUDAGetArrayWrite(diag, &darray)); 3765 GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray); 3766 PetscCallCUDA(cudaPeekAtLastError()); 3767 PetscCall(VecCUDARestoreArrayWrite(diag, &darray)); 3768 } 3769 } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag)); 3770 PetscFunctionReturn(PETSC_SUCCESS); 3771 } 3772 3773 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3774 { 3775 PetscFunctionBegin; 3776 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3777 PetscFunctionReturn(PETSC_SUCCESS); 3778 } 3779 3780 /*@ 3781 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs 3782 3783 Collective 3784 3785 Input Parameters: 3786 + comm - MPI communicator, set to `PETSC_COMM_SELF` 3787 . m - number of rows 3788 . n - number of columns 3789 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 3790 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 3791 3792 Output Parameter: 3793 . A - the matrix 3794 3795 Level: intermediate 3796 3797 Notes: 3798 This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for 3799 calculations. For good matrix assembly performance the user should preallocate the matrix 3800 storage by setting the parameter `nz` (or the array `nnz`). 3801 3802 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 3803 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3804 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 3805 3806 The AIJ format, also called 3807 compressed row storage, is fully compatible with standard Fortran 3808 storage. That is, the stored row and column indices can begin at 3809 either one (as in Fortran) or zero. 3810 3811 Specify the preallocated storage with either `nz` or `nnz` (not both). 3812 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 3813 allocation.
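   Example usage, a minimal sketch (the per-row estimate of 5 nonzeros is illustrative only):
.vb
   Mat A;

   PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, n, 5, NULL, &A));
   /* insert entries with MatSetValues(), then assemble before using the matrix */
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatDestroy(&A));
.ve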
3814 3815 When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()` 3816 3817 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`, 3818 `MatSetPreallocationCOO()`, `MatSetValuesCOO()` 3819 @*/ 3820 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3821 { 3822 PetscFunctionBegin; 3823 PetscCall(MatCreate(comm, A)); 3824 PetscCall(MatSetSizes(*A, m, n, m, n)); 3825 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 3826 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3827 PetscFunctionReturn(PETSC_SUCCESS); 3828 } 3829 3830 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3831 { 3832 PetscFunctionBegin; 3833 if (A->factortype == MAT_FACTOR_NONE) { 3834 PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 3835 } else { 3836 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3837 } 3838 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 3839 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 3840 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 3841 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 3842 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 3843 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3844 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 3845 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 3846 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 3847 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 3848 PetscCall(MatDestroy_SeqAIJ(A)); 3849 PetscFunctionReturn(PETSC_SUCCESS); 3850 } 3851 3852 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 3853 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3854 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3855 { 3856 PetscFunctionBegin; 3857 PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 3858 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 3859 PetscFunctionReturn(PETSC_SUCCESS); 3860 } 3861 3862 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3863 { 3864 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3865 Mat_SeqAIJCUSPARSE *cy; 3866 Mat_SeqAIJCUSPARSE *cx; 3867 PetscScalar *ay; 3868 const PetscScalar *ax; 3869 CsrMatrix *csry, *csrx; 3870 3871 PetscFunctionBegin; 3872 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3873 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3874 if (X->ops->axpy != Y->ops->axpy) { 3875 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3876 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3877 PetscFunctionReturn(PETSC_SUCCESS); 3878 } 3879 /* if we are here, it means both matrices are bound to GPU */ 3880 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3881 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3882 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3883 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3884 csry = (CsrMatrix *)cy->mat->mat; 3885 csrx = (CsrMatrix *)cx->mat->mat; 3886 /* see if we can turn this into a cublas axpy */ 3887 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3888 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3889 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3890 if (eq) str = SAME_NONZERO_PATTERN; 3891 } 3892 /* spgeam is buggy with one column */ 3893 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3894 3895 if (str == SUBSET_NONZERO_PATTERN) { 3896 PetscScalar b = 1.0; 3897 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3898 size_t bufferSize; 3899 void *buffer; 3900 #endif 3901 3902 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3903 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3904 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3905 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3906 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3907 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3908 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3909 PetscCall(PetscLogGpuTimeBegin()); 3910 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3911 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3912 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3913 PetscCall(PetscLogGpuTimeEnd()); 3914 PetscCallCUDA(cudaFree(buffer)); 3915 #else 3916 PetscCall(PetscLogGpuTimeBegin()); 3917 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3918 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3919 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3920 PetscCall(PetscLogGpuTimeEnd()); 3921 #endif 3922 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3923 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3924 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3925 } else if (str == SAME_NONZERO_PATTERN) { 3926 cublasHandle_t cublasv2handle; 3927 PetscBLASInt one = 1, bnz = 1; 3928 3929 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3930 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3931 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3932 PetscCall(PetscBLASIntCast(x->nz, &bnz)); 3933 PetscCall(PetscLogGpuTimeBegin()); 3934 
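/* identical nonzero patterns: Y += a*X reduces to an axpy over the two nonzero value arrays */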
PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 3935 PetscCall(PetscLogGpuFlops(2.0 * bnz)); 3936 PetscCall(PetscLogGpuTimeEnd()); 3937 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3938 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3939 } else { 3940 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3941 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3942 } 3943 PetscFunctionReturn(PETSC_SUCCESS); 3944 } 3945 3946 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3947 { 3948 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 3949 PetscScalar *ay; 3950 cublasHandle_t cublasv2handle; 3951 PetscBLASInt one = 1, bnz = 1; 3952 3953 PetscFunctionBegin; 3954 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3955 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3956 PetscCall(PetscBLASIntCast(y->nz, &bnz)); 3957 PetscCall(PetscLogGpuTimeBegin()); 3958 PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 3959 PetscCall(PetscLogGpuFlops(bnz)); 3960 PetscCall(PetscLogGpuTimeEnd()); 3961 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3962 PetscFunctionReturn(PETSC_SUCCESS); 3963 } 3964 3965 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3966 { 3967 PetscBool gpu = PETSC_FALSE; 3968 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3969 3970 PetscFunctionBegin; 3971 if (A->factortype == MAT_FACTOR_NONE) { 3972 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 3973 if (spptr->mat) { 3974 CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 3975 if (matrix->values) { 3976 gpu = PETSC_TRUE; 3977 thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3978 } 3979 } 3980 if (spptr->matTranspose) { 3981 CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3982 if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 3983 } 3984 } 3985 if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU; 3986 else { 3987 PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 3988 A->offloadmask = PETSC_OFFLOAD_CPU; 3989 } 3990 PetscFunctionReturn(PETSC_SUCCESS); 3991 } 3992 3993 static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m) 3994 { 3995 PetscFunctionBegin; 3996 *m = PETSC_MEMTYPE_CUDA; 3997 PetscFunctionReturn(PETSC_SUCCESS); 3998 } 3999 4000 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 4001 { 4002 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4003 4004 PetscFunctionBegin; 4005 if (A->factortype != MAT_FACTOR_NONE) { 4006 A->boundtocpu = flg; 4007 PetscFunctionReturn(PETSC_SUCCESS); 4008 } 4009 if (flg) { 4010 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4011 4012 A->ops->scale = MatScale_SeqAIJ; 4013 A->ops->getdiagonal = MatGetDiagonal_SeqAIJ; 4014 A->ops->axpy = MatAXPY_SeqAIJ; 4015 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4016 A->ops->mult = MatMult_SeqAIJ; 4017 A->ops->multadd = MatMultAdd_SeqAIJ; 4018 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4019 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4020 A->ops->multhermitiantranspose = NULL; 4021 A->ops->multhermitiantransposeadd = NULL; 4022 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 4023 A->ops->getcurrentmemtype = NULL; 4024 PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 4025 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 4026 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 4027 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 4028 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 4029 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 4030 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 4031 } else { 4032 A->ops->scale = MatScale_SeqAIJCUSPARSE; 4033 A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE; 4034 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4035 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4036 A->ops->mult = MatMult_SeqAIJCUSPARSE; 4037 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4038 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4039 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4040 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4041 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4042 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4043 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4044 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4045 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4046 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4047 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4048 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4049 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4050 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4051 4052 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4053 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4054 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4055 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4056 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4057 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4058 } 4059 A->boundtocpu = flg; 4060 if (flg && a->inode.size_csr) { 4061 a->inode.use = PETSC_TRUE; 4062 } else { 4063 a->inode.use = PETSC_FALSE; 4064 } 4065 PetscFunctionReturn(PETSC_SUCCESS); 4066 } 4067 4068 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4069 { 4070 Mat B; 4071 4072 PetscFunctionBegin; 4073 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4074 if (reuse == MAT_INITIAL_MATRIX) { 4075 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4076 } else if (reuse == MAT_REUSE_MATRIX) { 4077 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4078 } 4079 B = *newmat; 4080 4081 PetscCall(PetscFree(B->defaultvectype)); 4082 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4083 4084 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4085 if (B->factortype == MAT_FACTOR_NONE) { 4086 Mat_SeqAIJCUSPARSE *spptr; 4087 PetscCall(PetscNew(&spptr)); 4088 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4089 
PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4090 spptr->format = MAT_CUSPARSE_CSR; 4091 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4092 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4093 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4094 #else 4095 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4096 #endif 4097 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4098 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4099 #endif 4100 B->spptr = spptr; 4101 } else { 4102 Mat_SeqAIJCUSPARSETriFactors *spptr; 4103 4104 PetscCall(PetscNew(&spptr)); 4105 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4106 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4107 B->spptr = spptr; 4108 } 4109 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4110 } 4111 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 4112 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 4113 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 4114 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 4115 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4116 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 4117 B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4118 4119 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 4120 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 4121 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4122 #if defined(PETSC_HAVE_HYPRE) 4123 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4124 #endif 4125 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 4126 PetscFunctionReturn(PETSC_SUCCESS); 4127 } 4128 4129 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4130 { 4131 PetscFunctionBegin; 4132 PetscCall(MatCreate_SeqAIJ(B)); 4133 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 4134 PetscFunctionReturn(PETSC_SUCCESS); 4135 } 4136 4137 /*MC 4138 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs. 4139 4140 Options Database Keys: 4141 + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 4142 . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 4143 Other options include ell (ellpack) or hyb (hybrid). 4144 . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 4145 - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU 4146 4147 Level: beginner 4148 4149 Notes: 4150 These matrices can be in either CSR, ELL, or HYB format. 4151 4152 All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library. 4153 4154 Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens 4155 if some integer values passed in do not fit in `int`. 
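   Example usage from the command line, a sketch assuming an application `./app` that calls `MatSetFromOptions()`:
.vb
   ./app -mat_type aijcusparse -mat_cusparse_mult_storage_format csr
.ve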
4156 4157 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4158 M*/ 4159 4160 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4161 { 4162 PetscFunctionBegin; 4163 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 4164 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 4165 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 4166 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4167 PetscFunctionReturn(PETSC_SUCCESS); 4168 } 4169 4170 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4171 { 4172 Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4173 4174 PetscFunctionBegin; 4175 if (cusp) { 4176 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 4177 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4178 delete cusp->workVector; 4179 delete cusp->rowoffsets_gpu; 4180 delete cusp->csr2csc_i; 4181 delete cusp->coords; 4182 if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 4183 PetscCall(PetscFree(mat->spptr)); 4184 } 4185 PetscFunctionReturn(PETSC_SUCCESS); 4186 } 4187 4188 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4189 { 4190 PetscFunctionBegin; 4191 if (*mat) { 4192 delete (*mat)->values; 4193 delete (*mat)->column_indices; 4194 delete (*mat)->row_offsets; 4195 delete *mat; 4196 *mat = 0; 4197 } 4198 PetscFunctionReturn(PETSC_SUCCESS); 4199 } 4200 4201 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4202 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4203 { 4204 PetscFunctionBegin; 4205 if (*trifactor) { 4206 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4207 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 4208 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 4209 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 4210 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4211 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4212 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4213 #endif 4214 PetscCall(PetscFree(*trifactor)); 4215 } 4216 PetscFunctionReturn(PETSC_SUCCESS); 4217 } 4218 #endif 4219 4220 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4221 { 4222 CsrMatrix *mat; 4223 4224 PetscFunctionBegin; 4225 if (*matstruct) { 4226 if ((*matstruct)->mat) { 4227 if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4228 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4229 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4230 #else 4231 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 4232 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4233 #endif 4234 } else { 4235 mat = (CsrMatrix *)(*matstruct)->mat; 4236 PetscCall(CsrMatrix_Destroy(&mat)); 4237 } 4238 } 4239 if ((*matstruct)->descr) 
PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4240 delete (*matstruct)->cprowIndices; 4241 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4242 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4243 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4244 4245 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4246 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4247 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4248 4249 for (int i = 0; i < 3; i++) { 4250 if (mdata->cuSpMV[i].initialized) { 4251 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4252 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4253 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4254 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4255 if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4256 if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4257 #endif 4258 } 4259 } 4260 #endif 4261 delete *matstruct; 4262 *matstruct = NULL; 4263 } 4264 PetscFunctionReturn(PETSC_SUCCESS); 4265 } 4266 4267 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4268 { 4269 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4270 4271 PetscFunctionBegin; 4272 if (fs) { 4273 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4274 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4275 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4276 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4277 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4278 delete fs->workVector; 4279 fs->workVector = NULL; 4280 #endif 4281 delete fs->rpermIndices; 4282 delete fs->cpermIndices; 4283 fs->rpermIndices = NULL; 4284 fs->cpermIndices = NULL; 4285 fs->init_dev_prop = PETSC_FALSE; 4286 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4287 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4288 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4289 PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 4290 PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4291 PetscCallCUDA(cudaFree(fs->csrVal)); 4292 PetscCallCUDA(cudaFree(fs->diag)); 4293 PetscCallCUDA(cudaFree(fs->X)); 4294 PetscCallCUDA(cudaFree(fs->Y)); 4295 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4296 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4297 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4298 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4299 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4300 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4301 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4302 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4303 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4304 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4305 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4306 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4307 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4308 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4309 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4310 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4311 PetscCall(PetscFree(fs->csrRowPtr_h)); 4312 PetscCall(PetscFree(fs->csrVal_h)); 4313
PetscCall(PetscFree(fs->diag_h)); 4314 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4315 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4316 #endif 4317 } 4318 PetscFunctionReturn(PETSC_SUCCESS); 4319 } 4320 4321 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4322 { 4323 PetscFunctionBegin; 4324 if (*trifactors) { 4325 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4326 PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 4327 PetscCall(PetscFree(*trifactors)); 4328 } 4329 PetscFunctionReturn(PETSC_SUCCESS); 4330 } 4331 4332 struct IJCompare { 4333 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4334 { 4335 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4336 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4337 return false; 4338 } 4339 }; 4340 4341 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4342 { 4343 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4344 4345 PetscFunctionBegin; 4346 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4347 if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4348 if (destroy) { 4349 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4350 delete cusp->csr2csc_i; 4351 cusp->csr2csc_i = NULL; 4352 } 4353 A->transupdated = PETSC_FALSE; 4354 PetscFunctionReturn(PETSC_SUCCESS); 4355 } 4356 4357 static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4358 { 4359 MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 4360 4361 PetscFunctionBegin; 4362 PetscCallCUDA(cudaFree(coo->perm)); 4363 PetscCallCUDA(cudaFree(coo->jmap)); 4364 PetscCall(PetscFree(coo)); 4365 PetscFunctionReturn(PETSC_SUCCESS); 4366 } 4367 4368 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4369 { 4370 PetscBool dev_ij = PETSC_FALSE; 4371 PetscMemType mtype = PETSC_MEMTYPE_HOST; 4372 PetscInt *i, *j; 4373 PetscContainer container_h; 4374 MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4375 4376 PetscFunctionBegin; 4377 PetscCall(PetscGetMemType(coo_i, &mtype)); 4378 if (PetscMemTypeDevice(mtype)) { 4379 dev_ij = PETSC_TRUE; 4380 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 4381 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4382 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4383 } else { 4384 i = coo_i; 4385 j = coo_j; 4386 } 4387 4388 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 4389 if (dev_ij) PetscCall(PetscFree2(i, j)); 4390 mat->offloadmask = PETSC_OFFLOAD_CPU; 4391 // Create the GPU memory 4392 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4393 4394 // Copy the COO struct to device 4395 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 4396 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 4397 PetscCall(PetscMalloc1(1, &coo_d)); 4398 *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 4399 PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 4400 PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4401 PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 4402 PetscCallCUDA(cudaMemcpy(coo_d->perm, 

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sum all v[] entries that map to the same nonzero a[i]: jmap[i]..jmap[i+1] delimits, within perm[], the input positions contributing to a[i] */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) { // grid-stride loop, so any launch size covers all nonzeros
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* if the user gave v[] on the host, copy it to the device first */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
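/* Example usage (a minimal sketch, not part of PETSc itself): GPU COO assembly of a
   2x2 MATSEQAIJCUSPARSE matrix through the two routines above. The index and value
   arrays may live on the host or on the device; duplicate (i,j) pairs are summed by
   the MatAddCOOValues kernel via the jmap[]/perm[] arrays built at preallocation time.

     Mat         A;
     PetscInt    oi[] = {0, 0, 1, 1};
     PetscInt    oj[] = {0, 1, 0, 1};
     PetscScalar ov[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, oi, oj));
     PetscCall(MatSetValuesCOO(A, ov, INSERT_VALUES)); // values are combined on the GPU
     PetscCall(MatDestroy(&A));
*/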
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should be returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure was returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}
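/* Example usage (sketch): the pointers returned by MatSeqAIJCUSPARSEGetIJ() are device
   memory holding 32-bit indices, so they are meant to be handed to a kernel or a
   cuSPARSE call rather than dereferenced on the host.

     const int *di, *dj;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj));
     // ... launch a kernel in which row r spans dj[di[r]] .. dj[di[r+1]-1] ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/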
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
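/* Example usage (sketch, assuming a real-scalar build and <thrust/reduce.h>): read-only
   access to the nonzero values on the device, here summed with Thrust. Since nothing is
   written, the offload state of A is left untouched.

     const PetscScalar *aa;
     Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)A->data;

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &aa));
     PetscScalar sum = thrust::reduce(thrust::device_pointer_cast(aa), thrust::device_pointer_cast(aa) + aij->nz);
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &aa));
*/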
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a            = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
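/* Example usage (sketch, requires <thrust/transform.h>): read-write access, scaling all
   stored values in place on the device. MatSeqAIJCUSPARSEGetArray() marks the GPU copy
   authoritative and invalidates any cached transpose, so no manual synchronization is needed.

     PetscScalar *aa;
     Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)A->data;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &aa));
     auto first = thrust::device_pointer_cast(aa);
     thrust::transform(first, first + aij->nz, thrust::make_constant_iterator((PetscScalar)2.0), first, thrust::multiplies<PetscScalar>());
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &aa));
*/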
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write-only access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host-to-device copies.

  It marks the device data as valid, so callers must set all the values in `a` to ensure out-of-date data is not mistaken for current data.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
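/* Example usage (sketch): write-only access, overwriting every stored value without a
   prior host-to-device copy. Because nothing is read, all entries must be written before
   the array is restored; here they are zeroed wholesale.

     PetscScalar *aa;
     Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)A->data;

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &aa));
     PetscCallCUDA(cudaMemset(aa, 0, aij->nz * sizeof(PetscScalar)));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &aa));
*/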
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Adds a fixed shift to an index; used to offset B's column indices into C's column space */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows; the [A';B']' operation in MATLAB notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix *)Acusp->mat->mat;
    Bcsr  = (CsrMatrix *)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
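    /* Merge strategy: expand A and B into COO row indices (csr2coo), shift B's column
       indices by A->cmap->n so they address the right block of C, then merge the two
       (row, col, val, flag) streams with a lexicographic (row, col) comparator. The
       constant flag (1 for A, 0 for B) records the origin of each merged entry; the
       destination position in C of every entry of A (first Annz slots) and of B
       (remaining slots) is cached in Ccusp->coords so that MAT_REUSE_MATRIX calls can
       scatter fresh values without re-merging the structure. */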
    Ccusp->coords = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 // Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
#if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
      auto pred = thrust::identity<int>();
#else
      auto pred = cuda::std::identity();
#endif
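      /* Split the merged index stream back by origin: copy_if() keeps the positions whose
         flag is 1 (entries of A) at the front of Ccusp->coords (p1), while remove_copy_if()
         appends the positions whose flag is 0 (entries of B) starting at p2 = p1 + Annz. */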
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
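      /* Since C = [A B], its transpose is the vertical stack [A^T; B^T]: the row offsets of
         B^T are shifted by a->nz and appended to those of A^T, while the column indices and
         values of A^T and B^T are simply concatenated. */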
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter the fresh values of A, then of B, into C through the permutation cached at creation time */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
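/* Example usage (sketch): concatenating the columns of two MATSEQAIJCUSPARSE matrices
   entirely on the device. With MAT_INITIAL_MATRIX the structure of C = [A B] is built
   once; a subsequent MAT_REUSE_MATRIX call only scatters the current values of A and B
   into C through the permutation cached in Ccusp->coords.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... update the values of A and/or B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
     PetscCall(MatDestroy(&C));
*/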
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
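/* Example usage (sketch; the routine is static, so a caller must live in this file or
   reach it through the operations table it is registered in): gather selected stored
   values of A into a host buffer. Device output buffers are detected with isCudaMem()
   and filled directly on the device.

     PetscInt    idx[3] = {0, 5, 7}; // positions in the nonzero value array
     PetscScalar vals[3];            // host buffer
     PetscCall(MatSeqAIJCopySubArray_SeqAIJCUSPARSE(A, 3, idx, vals));
*/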