1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 2 3 EXTERN_C_BEGIN 4 #undef __FUNCT__ 5 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 6 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 7 { 8 Mat_MPIAIJ *b; 9 PetscErrorCode ierr; 10 PetscInt i; 11 12 PetscFunctionBegin; 13 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 14 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 15 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 16 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 17 18 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 19 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 20 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 21 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 22 if (d_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 25 } 26 } 27 if (o_nnz) { 28 for (i=0; i<B->rmap->n; i++) { 29 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 30 } 31 } 32 b = (Mat_MPIAIJ*)B->data; 33 34 if (!B->preallocated) { 35 /* Explicitly create 2 MATSEQAIJ matrices. */ 36 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 37 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 38 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 39 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 40 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 41 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 42 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 43 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 44 } 45 46 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 47 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 48 ierr=MatSetOption_SeqAIJCUSPARSE(b->A,b->diagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr); 49 ierr=MatSetOption_SeqAIJCUSPARSE(b->B,b->offdiagGPUMatFormat,PETSC_TRUE);CHKERRQ(ierr); 50 B->preallocated = PETSC_TRUE; 51 PetscFunctionReturn(0); 52 } 53 EXTERN_C_END 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 56 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 57 { 58 PetscErrorCode ierr; 59 60 PetscFunctionBegin; 61 if (right) { 62 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 63 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 64 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 65 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 66 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 67 } 68 if (left) { 69 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 70 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 71 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 72 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 73 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 74 } 75 PetscFunctionReturn(0); 76 } 77 78 79 #undef __FUNCT__ 80 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 81 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 82 { 83 // This multiplication sequence is different sequence 84 // than the CPU version. In particular, the diagonal block 85 // multiplication kernel is launched in one stream. Then, 86 // in a separate stream, the data transfers from DeviceToHost 87 // (with MPI messaging in between), then HostToDevice are 88 // launched. Once the data transfer stream is synchronized, 89 // to ensure messaging is complete, the MatMultAdd kernel 90 // is launched in the original (MatMult) stream to protect 91 // against race conditions. 92 // 93 // This sequence should only be called for GPU computation. 94 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 101 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 102 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 103 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 104 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 106 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 107 PetscFunctionReturn(0); 108 } 109 110 #undef __FUNCT__ 111 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 112 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 113 { 114 // This multiplication sequence is different sequence 115 // than the CPU version. In particular, the diagonal block 116 // multiplication kernel is launched in one stream. Then, 117 // in a separate stream, the data transfers from DeviceToHost 118 // (with MPI messaging in between), then HostToDevice are 119 // launched. Once the data transfer stream is synchronized, 120 // to ensure messaging is complete, the MatMultAdd kernel 121 // is launched in the original (MatMult) stream to protect 122 // against race conditions. 123 // 124 // This sequence should only be called for GPU computation. 125 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 126 PetscErrorCode ierr; 127 PetscInt nt; 128 129 PetscFunctionBegin; 130 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 131 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 132 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 133 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 134 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 135 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 136 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 137 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 138 PetscFunctionReturn(0); 139 } 140 141 EXTERN_C_BEGIN 142 PetscErrorCode MatCreate_MPIAIJ(Mat); 143 EXTERN_C_END 144 //PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats); 145 146 147 #undef __FUNCT__ 148 #define __FUNCT__ "MatSetOption_MPIAIJCUSPARSE" 149 PetscErrorCode MatSetOption_MPIAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 150 { 151 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 152 PetscErrorCode ierr; 153 154 PetscFunctionBegin; 155 ierr = MatSetOption_MPIAIJ(A,op,flg);CHKERRQ(ierr); 156 switch (op) { 157 case DIAGBLOCK_MAT_CSR: 158 a->diagGPUMatFormat = DIAGBLOCK_MAT_CSR; 159 break; 160 case OFFDIAGBLOCK_MAT_CSR: 161 a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_CSR; 162 break; 163 case DIAGBLOCK_MAT_DIA: 164 case OFFDIAGBLOCK_MAT_DIA: 165 case MAT_DIA: 166 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type."); 167 case DIAGBLOCK_MAT_ELL: 168 a->diagGPUMatFormat = DIAGBLOCK_MAT_ELL; 169 break; 170 case OFFDIAGBLOCK_MAT_ELL: 171 a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_ELL; 172 break; 173 case DIAGBLOCK_MAT_HYB: 174 a->diagGPUMatFormat = DIAGBLOCK_MAT_HYB; 175 break; 176 case OFFDIAGBLOCK_MAT_HYB: 177 a->offdiagGPUMatFormat = OFFDIAGBLOCK_MAT_HYB; 178 break; 179 default: 180 break; 181 } 182 PetscFunctionReturn(0); 183 } 184 185 186 EXTERN_C_BEGIN 187 #undef __FUNCT__ 188 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 189 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 190 { 191 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 192 PetscErrorCode ierr; 193 PetscInt idxDiag=0,idxOffDiag=0; 194 char * formats[]={CSR,ELL,HYB}; 195 MatOption diagFormat, offdiagFormat; 196 PetscBool flg; 197 PetscFunctionBegin; 198 ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, MPIAIJCUSPARSE Options","Mat");CHKERRQ(ierr); 199 if (A->factortype==MAT_FACTOR_NONE) { 200 ierr = PetscOptionsEList("-mat_mult_cusparse_diag_storage_format", 201 "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV", 202 "None",formats,3,formats[0],&idxDiag,&flg);CHKERRQ(ierr); 203 ierr = PetscOptionsEList("-mat_mult_cusparse_offdiag_storage_format", 204 "Set the storage format of (mpi)aijcusparse gpu matrices for SpMV", 205 "None",formats,3,formats[0],&idxOffDiag,&flg);CHKERRQ(ierr); 206 207 //printf("MatSetFromOptions_MPIAIJCUSPARSE : %s %s\n",formats[idxDiag],formats[idxOffDiag]); 208 if (formats[idxDiag] == CSR) 209 diagFormat=MAT_CSR; 210 else if (formats[idxDiag] == ELL) 211 diagFormat=MAT_ELL; 212 else if (formats[idxDiag] == HYB) 213 diagFormat=MAT_HYB; 214 215 if (formats[idxOffDiag] == CSR) 216 offdiagFormat=MAT_CSR; 217 else if (formats[idxOffDiag] == ELL) 218 offdiagFormat=MAT_ELL; 219 else if (formats[idxOffDiag] == HYB) 220 offdiagFormat=MAT_HYB; 221 222 a->diagGPUMatFormat = diagFormat; 223 a->offdiagGPUMatFormat = offdiagFormat; 224 } 225 ierr = PetscOptionsEnd();CHKERRQ(ierr); 226 PetscFunctionReturn(0); 227 } 228 EXTERN_C_END 229 230 231 EXTERN_C_BEGIN 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 234 PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat B) 235 { 236 PetscErrorCode ierr; 237 238 PetscFunctionBegin; 239 ierr = MatCreate_MPIAIJ(B);CHKERRQ(ierr); 240 ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatMPIAIJSetPreallocation_C", 241 "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE", 242 MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 243 B->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 244 B->ops->mult = MatMult_MPIAIJCUSPARSE; 245 B->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 246 B->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 247 B->ops->setoption = MatSetOption_MPIAIJCUSPARSE; 248 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 249 PetscFunctionReturn(0); 250 } 251 EXTERN_C_END 252 253 254 #undef __FUNCT__ 255 #define __FUNCT__ "MatCreateAIJCUSPARSE" 256 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 257 { 258 PetscErrorCode ierr; 259 PetscMPIInt size; 260 261 PetscFunctionBegin; 262 ierr = MatCreate(comm,A);CHKERRQ(ierr); 263 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 264 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 265 if (size > 1) { 266 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 267 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 268 } else { 269 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 270 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 271 } 272 PetscFunctionReturn(0); 273 } 274 275 /*MC 276 MATAIJCUSPARSE - MATAIJCUSPARSE = "aijcusparse" - A matrix type to be used for sparse matrices. 277 278 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 279 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 280 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 281 for communicators controlling multiple processes. It is recommended that you call both of 282 the above preallocation routines for simplicity. 283 284 Options Database Keys: 285 . -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 286 287 Level: beginner 288 289 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE 290 M*/ 291 292