1 #include <petscdevice_cuda.h>
2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
3
4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */
5 struct PC_VPBJacobi_CUDA {
6 /* Cache the old sizes to check if we need realloc */
7 PetscInt n; /* number of rows of the local matrix */
8 PetscInt nblocks; /* number of point blocks */
9 PetscInt nsize; /* sum of sizes of the point blocks */
10
11 /* Helper arrays that are pre-computed on host and then copied to device.
12 bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
13 bs2: [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize.
14 matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
15 */
16 PetscInt *bs_h, *bs2_h, *matIdx_h;
17 PetscInt *bs_d, *bs2_d, *matIdx_d;
18
19 MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */
20
PC_VPBJacobi_CUDAPC_VPBJacobi_CUDA21 PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize)
22 {
23 /* malloc memory on host and device, and then update */
24 PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h));
25 PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1)));
26 PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1)));
27 PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n));
28 PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize));
29 PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h));
30 }
31
UpdateOffsetsOnDevicePC_VPBJacobi_CUDA32 PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h)
33 {
34 PetscFunctionBegin;
35 PetscCall(ComputeOffsetsOnHost(bsizes));
36 PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
37 PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
38 PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
39 PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice));
40 PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
41 PetscFunctionReturn(PETSC_SUCCESS);
42 }
43
~PC_VPBJacobi_CUDAPC_VPBJacobi_CUDA44 ~PC_VPBJacobi_CUDA()
45 {
46 PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h));
47 PetscCallCUDAVoid(cudaFree(bs_d));
48 PetscCallCUDAVoid(cudaFree(bs2_d));
49 PetscCallCUDAVoid(cudaFree(matIdx_d));
50 PetscCallCUDAVoid(cudaFree(diag_d));
51 }
52
53 private:
ComputeOffsetsOnHostPC_VPBJacobi_CUDA54 PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
55 {
56 PetscFunctionBegin;
57 bs_h[0] = bs2_h[0] = 0;
58 for (PetscInt i = 0; i < nblocks; i++) {
59 bs_h[i + 1] = bs_h[i] + bsizes[i];
60 bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
61 for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
62 }
63 PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 };
66
67 /* Like cublasDgemvBatched() but with variable-sized blocks
68
69 Input Parameters:
70 + n - number of rows of the local matrix
71 . bs - [nblocks+1], prefix sum of bsizes[]
72 . bs2 - [nblocks+1], prefix sum of squares of bsizes[]
73 . matIdx - [n], store block/matrix index for each row
74 . A - blocks of the matrix back to back in column-major order
75 . x - the input vector
76 - transpose - whether it is MatMult for Ax (false) or MatMultTranspose for A^Tx (true)
77
78 Output Parameter:
79 . y - the output vector
80 */
MatMultBatched(PetscInt n,const PetscInt * bs,const PetscInt * bs2,const PetscInt * matIdx,const MatScalar * A,const PetscScalar * x,PetscScalar * y,PetscBool transpose)81 __global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
82 {
83 const PetscInt gridSize = gridDim.x * blockDim.x;
84 PetscInt tid = blockIdx.x * blockDim.x + threadIdx.x;
85 PetscInt i, j, k, m;
86
87 /* One row per thread. The blocks/matrices are stored in column-major order */
88 for (; tid < n; tid += gridSize) {
89 k = matIdx[tid]; /* k-th block */
90 m = bs[k + 1] - bs[k]; /* block size of the k-th block */
91 i = tid - bs[k]; /* i-th row of the block */
92 A += bs2[k] + i * (transpose ? m : 1); /* advance A to the first entry of i-th row */
93 x += bs[k];
94 y += bs[k];
95
96 y[i] = 0.0;
97 for (j = 0; j < m; j++) {
98 y[i] += A[0] * x[j];
99 A += (transpose ? 1 : m);
100 }
101 }
102 }
103
PCApplyOrTranspose_VPBJacobi_CUDA(PC pc,Vec x,Vec y,PetscBool transpose)104 static PetscErrorCode PCApplyOrTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y, PetscBool transpose)
105 {
106 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
107 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
108 const PetscScalar *xx;
109 PetscScalar *yy;
110 PetscInt n;
111
112 PetscFunctionBegin;
113 PetscCall(PetscLogGpuTimeBegin());
114 if (PetscDefined(USE_DEBUG)) {
115 PetscBool isCuda;
116 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
117 if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
118 PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pmat to non-cuda vectors");
119 }
120
121 PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
122 if (n) {
123 PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* <= 2^31-1 */
124 PetscCall(VecCUDAGetArrayRead(x, &xx));
125 PetscCall(VecCUDAGetArrayWrite(y, &yy));
126 MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy, transpose);
127 PetscCallCUDA(cudaGetLastError());
128 PetscCall(VecCUDARestoreArrayRead(x, &xx));
129 PetscCall(VecCUDARestoreArrayWrite(y, &yy));
130 }
131 PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* FMA on entries in all blocks */
132 PetscCall(PetscLogGpuTimeEnd());
133 PetscFunctionReturn(PETSC_SUCCESS);
134 }
135
PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y)136 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
137 {
138 PetscFunctionBegin;
139 PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_FALSE));
140 PetscFunctionReturn(PETSC_SUCCESS);
141 }
142
PCApplyTranspose_VPBJacobi_CUDA(PC pc,Vec x,Vec y)143 static PetscErrorCode PCApplyTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
144 {
145 PetscFunctionBegin;
146 PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_TRUE));
147 PetscFunctionReturn(PETSC_SUCCESS);
148 }
149
PCDestroy_VPBJacobi_CUDA(PC pc)150 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
151 {
152 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
153
154 PetscFunctionBegin;
155 PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr));
156 PetscCall(PCDestroy_VPBJacobi(pc));
157 PetscFunctionReturn(PETSC_SUCCESS);
158 }
159
PCSetUp_VPBJacobi_CUDA(PC pc,Mat diagVPB)160 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc, Mat diagVPB)
161 {
162 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
163 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
164 PetscInt i, n, nblocks, nsize = 0;
165 const PetscInt *bsizes;
166
167 PetscFunctionBegin;
168 PetscCall(PCSetUp_VPBJacobi_Host(pc, diagVPB)); /* Compute the inverse on host now. Might worth doing it on device directly */
169 PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
170 for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
171 PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
172
173 /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
174 if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
175 PetscCallCXX(delete pcuda);
176 pcuda = nullptr;
177 }
178
179 if (!pcuda) { /* allocate the struct along with the helper arrays from the scratch */
180 PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag));
181 } else { /* update the value only */
182 PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag));
183 }
184
185 pc->ops->apply = PCApply_VPBJacobi_CUDA;
186 pc->ops->applytranspose = PCApplyTranspose_VPBJacobi_CUDA;
187 pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
188 PetscFunctionReturn(PETSC_SUCCESS);
189 }
190